From 5bd3b9d06bed831f8153f2cd4d56f82ba21fc858 Mon Sep 17 00:00:00 2001 From: amancca Date: Fri, 5 Jun 2026 14:39:11 +0300 Subject: [PATCH] feat: initial polysearch - multi-engine search with proxy distribution, circuit breaker, REST API, and metrics --- .env.example | 11 ++ .gitignore | 5 + API.md | 281 ++++++++++++++++++++++++++++++ README.md | 115 +++++++++++++ benchmark.js | 200 ++++++++++++++++++++++ config.example.json | 39 +++++ package-lock.json | 182 ++++++++++++++++++++ package.json | 39 +++++ src/api-key.js | 51 ++++++ src/api.js | 242 ++++++++++++++++++++++++++ src/cli.js | 50 ++++++ src/config.js | 134 +++++++++++++++ src/engines/base.js | 25 +++ src/engines/duckduckgo.js | 177 +++++++++++++++++++ src/engines/index.js | 24 +++ src/engines/setup.js | 17 ++ src/http/client.js | 189 ++++++++++++++++++++ src/http/providers/index.js | 39 +++++ src/http/providers/oxylabs.js | 26 +++ src/http/providers/webshare.js | 45 +++++ src/http/proxy.js | 304 +++++++++++++++++++++++++++++++++ src/index.js | 160 +++++++++++++++++ src/output/agent.js | 133 +++++++++++++++ src/output/human.js | 72 ++++++++ src/run.js | 123 +++++++++++++ src/utils/logger.js | 23 +++ src/utils/retry.js | 45 +++++ src/utils/ua.js | 29 ++++ 28 files changed, 2780 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 API.md create mode 100644 README.md create mode 100644 benchmark.js create mode 100644 config.example.json create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 src/api-key.js create mode 100644 src/api.js create mode 100644 src/cli.js create mode 100644 src/config.js create mode 100644 src/engines/base.js create mode 100644 src/engines/duckduckgo.js create mode 100644 src/engines/index.js create mode 100644 src/engines/setup.js create mode 100644 src/http/client.js create mode 100644 src/http/providers/index.js create mode 100644 src/http/providers/oxylabs.js create mode 100644 src/http/providers/webshare.js create mode 100644 src/http/proxy.js create mode 100755 src/index.js create mode 100644 src/output/agent.js create mode 100644 src/output/human.js create mode 100644 src/run.js create mode 100644 src/utils/logger.js create mode 100644 src/utils/retry.js create mode 100644 src/utils/ua.js diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..b2a8265 --- /dev/null +++ b/.env.example @@ -0,0 +1,11 @@ +# Webshare.io API key for automatic proxy fetching +# Get yours at https://www.webshare.io/user/api +WEBSHARE_API_KEY= + +# Oxylabs datacenter proxies (dc.oxylabs.io:8000) +# https://developers.oxylabs.io/proxies/datacenter-proxies +OXYLABS_USERNAME= +OXYLABS_PASSWORD= +OXYLABS_COUNTRY=US + +# Future: Add other proxy providers here (BrightData, Smartproxy, etc.) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1956bb5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +node_modules/ +.env +config.json +proxies.txt +*.log diff --git a/API.md b/API.md new file mode 100644 index 0000000..ed5e43f --- /dev/null +++ b/API.md @@ -0,0 +1,281 @@ +# PolySearch API + +A search API for AI agents. Submit queries, get structured results. Supports single and batch requests. + +**Base URL:** `http://:9876` + +**Auth:** All endpoints except `/health` require a Bearer token. + +```http +Authorization: Bearer +``` + +--- + +## Quick reference + +| Endpoint | Method | Auth | Purpose | +|----------|--------|------|---------| +| `/health` | GET | No | Ping the server | +| `/search` | POST | Yes | One query, one response | +| `/batch` | POST | Yes | Multiple queries, one response | +| `/metrics` | GET | Yes | Proxy pool health and usage | + +--- + +## Authentication + +The server operator provides an API key. Include it in every request (except `/health`): + +```bash +curl -H "Authorization: Bearer " http://localhost:9876/search ... +``` + +--- + +## `GET /health` + +Check if the server is running. + +```bash +curl http://localhost:9876/health +``` + +```json +{ + "success": true, + "status": "ok", + "uptime": 42.5, + "proxyPool": { + "total": 11, + "alive": 11, + "dead": 0, + "requestsTotal": 47, + "successRate": "91.5%", + "hourlyUsage": 12 + } +} +``` + +--- + +## `POST /search` + +Single search. Returns either image results, web results, or both. + +```bash +curl -X POST http://localhost:9876/search \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{"query": "mars rover", "type": "image", "limit": 3}' +``` + +### Request + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `query` | string | yes | — | What to search for | +| `type` | string | no | `image` | `image`, `web`, or `both` | +| `limit` | number | no | `10` | Max results per type (1–50) | + +### Response — Image results + +```json +{ + "success": true, + "query": "mars rover", + "type": "image", + "execution_time_ms": 3200, + "image": { + "engine": "duckduckgo", + "total": 3, + "results": [ + { + "index": 1, + "title": "NASA Perseverance Rover's Stunning Find...", + "image_url": "https://scitechdaily.com/images/...jpg", + "source_url": "https://scitechdaily.com/...", + "domain": "scitechdaily.com", + "width": 2560, + "height": 1818, + "thumbnail": "https://tse4.mm.bing.net/th?id=...", + "engine": "duckduckgo" + } + ], + "statistics": { + "avg_width": 3200, + "avg_height": 1989, + "domains": { + "scitechdaily.com": 1, + "static1.simpleflyingimages.com": 2 + } + } + } +} +``` + +Each image result contains: + +| Field | Type | Description | +|-------|------|-------------| +| `title` | string | Image description or caption | +| `image_url` | string | Direct URL to the image file | +| `source_url` | string | Page the image was found on | +| `domain` | string | Hosting domain | +| `width` | number | Image width (pixels) | +| `height` | number | Image height (pixels) | +| `thumbnail` | string | Thumbnail URL | +| `engine` | string | Search engine used | + +### Response — Web results + +When `type` is `web` or `both`: + +```json +{ + "web": { + "engine": "duckduckgo", + "total": 3, + "results": [ + { + "index": 1, + "title": "Quantum computing - Wikipedia", + "url": "https://en.wikipedia.org/wiki/Quantum_computing", + "domain": "en.wikipedia.org", + "snippet": "A quantum computer is a computer that exploits quantum mechanical phenomena...", + "engine": "duckduckgo" + } + ], + "statistics": { + "domains": { + "en.wikipedia.org": 1 + } + } + } +} +``` + +### Response — Error + +```json +{ + "success": false, + "error": { + "code": "ENGINE_FAILED", + "message": "All search engines returned errors", + "type": "Error" + }, + "query": "obscure term", + "timestamp": "2026-06-05T10:55:25.088Z" +} +``` + +--- + +## `POST /batch` + +Run 2–50 queries in a single request. Queries execute concurrently. Results return when all are complete. + +```bash +curl -X POST http://localhost:9876/batch \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{ + "queries": [ + {"query": "vintage radio", "type": "image", "limit": 2}, + {"query": "mars rover", "type": "image", "limit": 2}, + {"query": "aurora borealis","type": "image", "limit": 2} + ] + }' +``` + +### Request + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `queries` | array | yes | 2–50 query objects | + +Each query object: + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `query` | string | yes | — | Search term | +| `type` | string | no | `image` | `image`, `web`, or `both` | +| `limit` | number | no | `10` | Max results per type | + +### Response + +```json +{ + "success": true, + "batch_size": 3, + "execution_time_ms": 4511, + "ok": 3, + "fail": 0, + "results": [ + { + "index": 0, + "query": "vintage radio", + "type": "image", + "success": true, + "execution_time_ms": 4313, + "results": { ... full search response ... }, + "errors": [] + } + ], + "metrics": { + "hourly_usage": 7 + } +} +``` + +Each result mirrors the single `/search` response. Failed queries include an `error` field instead of `results`. + +Use `batch_size` to verify all queries were accepted, and `ok` / `fail` for a quick success count. + +--- + +## `GET /metrics` + +Proxy pool statistics. Useful for monitoring health and utilization. + +```bash +curl -H "Authorization: Bearer " http://localhost:9876/metrics +``` + +```json +{ + "success": true, + "metrics": { + "totalProxies": 11, + "alive": 11, + "dead": 0, + "circuitOpen": 0, + "requestsTotal": 47, + "successTotal": 43, + "failureTotal": 4, + "successRate": "91.5%", + "hourlyUsageCurrent": 12, + "byProvider": { + "webshare": { + "proxyCount": 10, + "alive": 10, + "requests": 47, + "success": 43, + "failure": 4, + "avgLatencyMs": 3604, + "successRate": "91.5%", + "hourlyUsage": { "currentHour": 12 } + } + } + } +} +``` + +--- + +## Notes + +- **Rate limits** depend on the proxy provider. The system distributes requests evenly and opens circuit breakers when proxies fail repeatedly. +- **Timeouts** vary. Image searches typically take 2–15 seconds depending on query and network conditions. +- **Max batch size** is 50 queries per request. diff --git a/README.md b/README.md new file mode 100644 index 0000000..6467ab7 --- /dev/null +++ b/README.md @@ -0,0 +1,115 @@ +# PolySearch + +Multi-engine web + image search with smart proxy distribution, circuit breakers, structured AI agent output, and a REST API. + +```bash +node src/index.js -q "quantum computing" -t both -l 10 -m agent +``` + +## Features + +- **Web + Image search** — both result types, one tool +- **Smart proxy distribution** — least-used selection per hour, balanced across providers +- **Circuit breaker per proxy** — exponential backoff on failure, auto-recovery +- **Multi-provider proxy system** — add Webshare, Oxylabs, BrightData in one file each +- **Multi-engine architecture** — add Brave, Bing, Google in one file each +- **Per-provider metrics** — requests, success rate, latency, hourly usage grouped by provider +- **Dual output modes**: + - `human` — colorized terminal + - `agent` — structured JSON with statistics +- **REST API** — single search, batch search, auth with API keys. See [API.md](API.md) + +## Requirements + +Node.js 18+ + +## Quick start + +```bash +# Single image search +node src/index.js -q "vintage radio" + +# Web search +node src/index.js -q "quantum computing" -t web + +# Both types, AI agent JSON +node src/index.js -q "spacex starship" -t both -l 10 -m agent + +# Show proxy metrics after a search +node src/index.js -q "mars rover" -M +``` + +## CLI + +| Flag | Long | Description | Default | +|------|------|-------------|---------| +| `-q` | `--query` | Search query | — | +| `-t` | `--type` | `web`, `image`, or `both` | `image` | +| `-l` | `--limit` | Max results per type | `10` | +| `-m` | `--mode` | `human` or `agent` | `human` | +| `-p` | `--proxy` | Single proxy URL override | — | +| `-c` | `--config` | Path to config file | auto-detect | +| `-M` | `--metrics` | Dump proxy pool metrics | — | +| | `--serve` | Start REST API server | — | +| | `--port` | API server port | `9876` | +| | `--generate-key` | Generate API key | — | +| `-h` | `--help` | Show help | — | + +## REST API + +For AI agent consumption. See [API.md](API.md) for full documentation. + +```bash +node src/index.js --generate-key # create an API key +node src/index.js --serve --port 9876 # start the server +``` + +**Endpoints:** `GET /health`, `POST /search`, `POST /batch`, `GET /metrics` + +--- + +## CLI + +Providers are auto-discovered from environment variables: + +| Provider | Env vars | Type | +|----------|----------|------| +| Webshare | `WEBSHARE_API_KEY` | API-fetched, 10 rotating IPs | +| Oxylabs | `OXYLABS_USERNAME`, `OXYLABS_PASSWORD`, `OXYLABS_COUNTRY` | Single datacenter endpoint | + +Add a new provider by creating a file in `src/http/providers/` that calls `registerProvider(name, fetcher)`. The fetcher returns an array of proxy URL strings. + +## Engine architecture + +Engines are registered in `src/engines/setup.js`. Each engine supports `web`, `image`, or both. DuckDuckGo is the default. Add Brave, Bing, or custom engines by implementing the `search(query, opts)` interface. + +## Project structure + +``` +src/ +├── index.js # CLI + programmatic API + API server dispatch +├── api.js # REST API server (/search, /batch, /metrics, /health) +├── api-key.js # Key generation + env storage +├── cli.js # Argument parsing +├── config.js # Config loader (json + env + providers) +├── run.js # Search orchestration + engine fallback +├── engines/ +│ ├── base.js # Abstract engine interface +│ ├── index.js # Engine registry +│ ├── setup.js # Built-in engine registration +│ └── duckduckgo.js # DuckDuckGo (web + image) +├── http/ +│ ├── client.js # Fetch wrapper (proxy, retry, timeout, UA) +│ ├── proxy.js # Proxy pool (least-used, circuit breaker, metrics) +│ └── providers/ +│ ├── index.js # Provider registry +│ ├── webshare.js # Webshare.io +│ └── oxylabs.js # Oxylabs datacenter +├── output/ +│ ├── human.js # Terminal formatting +│ └── agent.js # JSON formatting +└── utils/ + ├── logger.js # Pino structured logging + ├── retry.js # Exponential backoff + jitter + └── ua.js # User-agent pool +``` diff --git a/benchmark.js b/benchmark.js new file mode 100644 index 0000000..d47ecd2 --- /dev/null +++ b/benchmark.js @@ -0,0 +1,200 @@ +#!/usr/bin/env node +import { loadConfig } from "./src/config.js"; +import { HttpClient } from "./src/http/client.js"; +import { ProxyPool } from "./src/http/proxy.js"; +import { SearchRunner } from "./src/run.js"; +import { setUserAgents } from "./src/utils/ua.js"; +import { childLogger } from "./src/utils/logger.js"; + +const WEB_QUERIES = [ + "quantum computing", "machine learning", "renaissance art", "solar system", + "ancient rome", "climate change", "python programming", "space exploration", + "world war 2", "ocean depth", "artificial intelligence", "mount everest", + "greek mythology", "industrial revolution", "human genome", "black holes", + "coral reef", "buddhism history", "cold war", "mars colonization", + "electric vehicles", "great barrier reef", "dark matter", "dinosaur fossils", + "ancient egypt pyramids", "big bang theory", "amazon rainforest", + "vitamin deficiency", "stock market crash 1929", "northern lights" +]; + +const IMAGE_QUERIES = [ + "vintage radio", "mars rover", "aurora borealis", "mountain landscape", + "classic cars", "modern architecture", "street photography", "wild animals", + "space nebula", "underwater coral", "sunset beach", "city skyline", + "butterfly macro", "starry night sky", "tropical forest", "medieval castle", + "abstract art", "vintage motorcycles", "taj mahal", "rainforest waterfall", + "northern lights norway", "japanese garden", "safari animals", "galaxy cluster", + "old steam train", "desert dunes", "cherry blossom", "iceberg antarctica", + "neon city night", "autumn forest path" +]; + +function generateReport(results) { + const total = results.length; + const success = results.filter(r => r.success).length; + const failed = results.filter(r => !r.success).length; + const successRate = (success / total * 100).toFixed(1); + + const byType = {}; + for (const r of results) { + byType[r.type] = byType[r.type] || []; + byType[r.type].push(r); + } + + console.log("\n" + "=".repeat(80)); + console.log(" BENCHMARK REPORT"); + console.log("=".repeat(80)); + console.log(`\n Total requests: ${total}`); + console.log(` Successful: ${success} (${successRate}%)`); + console.log(` Failed: ${failed} (${(100 - parseFloat(successRate)).toFixed(1)}%)`); + console.log(` Date: ${new Date().toISOString()}`); + console.log(` Proxy pool: ${results[0]?.proxyCount || "N/A"} proxies`); + + for (const [type, items] of Object.entries(byType)) { + const tSuccess = items.filter(r => r.success).length; + const tFailed = items.filter(r => !r.success).length; + const times = items.filter(r => r.success).map(r => r.durationMs).sort((a, b) => a - b); + const dataSizes = items.filter(r => r.dataSizeKb != null).map(r => r.dataSizeKb).sort((a, b) => a - b); + + console.log(`\n ── ${type.toUpperCase()} (${items.length} requests, ${tSuccess} ok / ${tFailed} fail) ──`); + + if (times.length > 0) { + const avg = times.reduce((s, v) => s + v, 0) / times.length; + const p50 = times[Math.floor(times.length * 0.5)]; + const p95 = times[Math.floor(times.length * 0.95)]; + const p99 = times[Math.floor(times.length * 0.99)]; + const min = times[0]; + const max = times[times.length - 1]; + console.log(` Response time (ms): avg=${avg.toFixed(0)} p50=${p50} p95=${p95} p99=${p99} min=${min} max=${max}`); + } + + if (dataSizes.length > 0) { + const avg = dataSizes.reduce((s, v) => s + v, 0) / dataSizes.length; + const p50 = dataSizes[Math.floor(dataSizes.length * 0.5)]; + const p95 = dataSizes[Math.floor(dataSizes.length * 0.95)]; + console.log(` Data size (KB): avg=${avg.toFixed(2)} p50=${p50.toFixed(1)} p95=${p95.toFixed(1)}`); + } + + const engineCounts = {}; + items.filter(r => r.success).forEach(r => { + const key = r.engine || "unknown"; + engineCounts[key] = (engineCounts[key] || 0) + 1; + }); + if (Object.keys(engineCounts).length > 0) { + console.log(` Engines used: ${Object.entries(engineCounts).map(([k, v]) => `${k}=${v}`).join(", ")}`); + } + + const proxiesUsed = new Set(items.filter(r => r.success).map(r => r.proxyHost).filter(Boolean)); + const proxyFails = items.filter(r => !r.success).map(r => r.proxyHost).filter(Boolean); + if (proxiesUsed.size > 0) { + console.log(` Distinct proxies used: ${proxiesUsed.size}`); + console.log(` Proxy failures: ${proxyFails.length}`); + } + } + + const errorTypes = {}; + for (const r of results) { + if (!r.success && r.error) { + const key = r.error.substring(0, 60); + errorTypes[key] = (errorTypes[key] || 0) + 1; + } + } + if (Object.keys(errorTypes).length > 0) { + console.log(`\n ── Error Breakdown ──`); + for (const [err, count] of Object.entries(errorTypes).sort((a, b) => b[1] - a[1])) { + console.log(` [${count}x] ${err}`); + } + } + + console.log("=".repeat(80) + "\n"); +} + +async function runSingleTest(runner, query, type, index) { + const start = Date.now(); + try { + const data = await runner.run({ query, type, limit: 3 }); + const durationMs = Date.now() - start; + const results = data[type]?.results || []; + return { + index, + query, + type, + success: true, + durationMs, + dataSizeKb: JSON.stringify(data).length / 1024, + resultCount: results.length, + engine: data[type]?.engine || "none", + proxyHost: null, + errors: data.errors?.length || 0 + }; + } catch (err) { + return { + index, + query, + type, + success: false, + durationMs: Date.now() - start, + error: err.message, + resultCount: 0, + engine: "none", + proxyHost: null + }; + } +} + +async function main() { + const log = childLogger({ component: "benchmark" }); + + log.info({ webCount: WEB_QUERIES.length, imageCount: IMAGE_QUERIES.length }, "starting benchmark"); + + const config = await loadConfig(); + if (config.http.user_agents) setUserAgents(config.http.user_agents); + + const proxyPool = new ProxyPool(config.proxies, config.proxy); + const httpClient = new HttpClient(config.http); + + if (config.proxy.enabled) { + httpClient.setProxyPool(proxyPool); + log.info({ count: config.proxies.length }, "proxy pool attached"); + proxyPool.logState(); + } + + const runner = new SearchRunner({ httpClient, config }); + const allResults = []; + + const queries = [ + ...WEB_QUERIES.map(q => ({ query: q, type: "web" })), + ...IMAGE_QUERIES.map(q => ({ query: q, type: "image" })) + ]; + + const batchSize = 5; + for (let i = 0; i < queries.length; i += batchSize) { + const batch = queries.slice(i, i + batchSize); + log.info({ batch: Math.floor(i / batchSize) + 1, total: queries.length }, `running batch`); + + const batchResults = await Promise.all( + batch.map((item, j) => { + const idx = i + j + 1; + return runSingleTest(runner, item.query, item.type, idx); + }) + ); + + allResults.push(...batchResults); + + for (const r of batchResults) { + const icon = r.success ? "✓" : "✗"; + console.log(`${icon} [${r.type.padEnd(5)}] #${String(r.index).padStart(2)} "${r.query.slice(0, 30).padEnd(30)}" ${r.success ? `${r.durationMs}ms` : `FAIL: ${r.error?.slice(0, 40)}`}`); + } + } + + const totalDuration = allResults.reduce((s, r) => s + r.durationMs, 0); + log.info({ + totalRequests: allResults.length, + totalDurationMs: totalDuration, + avgDurationMs: Math.round(totalDuration / allResults.length), + successRate: `${(allResults.filter(r => r.success).length / allResults.length * 100).toFixed(1)}%` + }, "benchmark complete"); + + generateReport(allResults); +} + +main(); diff --git a/config.example.json b/config.example.json new file mode 100644 index 0000000..4aa2524 --- /dev/null +++ b/config.example.json @@ -0,0 +1,39 @@ +{ + "engines": { + "duckduckgo": { + "priority": 1, + "enabled": true, + "timeout_ms": 10000, + "retries": 2 + }, + "brave": { + "priority": 2, + "enabled": false, + "timeout_ms": 8000, + "retries": 1, + "api_key": "" + } + }, + "proxy": { + "enabled": false, + "rotation": "round-robin", + "health_check_interval_ms": 60000, + "max_failures": 3 + }, + "http": { + "timeout_ms": 10000, + "retry_max_attempts": 3, + "retry_base_delay_ms": 1000, + "retry_max_delay_ms": 10000, + "user_agents": [ + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + ] + }, + "search": { + "default_type": "image", + "default_limit": 10, + "max_limit": 50 + } +} diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..4a96c53 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,182 @@ +{ + "name": "image-search", + "version": "2.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "image-search", + "version": "2.0.0", + "license": "MIT", + "dependencies": { + "dotenv": "^17.4.2", + "pino": "^10.3.1", + "undici": "^7.27.1" + }, + "bin": { + "image-search": "src/index.js" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@pinojs/redact": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/@pinojs/redact/-/redact-0.4.0.tgz", + "integrity": "sha512-k2ENnmBugE/rzQfEcdWHcCY+/FM3VLzH9cYEsbdsoqrvzAKRhUZeRNhAZvB8OitQJ1TBed3yqWtdjzS6wJKBwg==", + "license": "MIT" + }, + "node_modules/atomic-sleep": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz", + "integrity": "sha512-kNOjDqAh7px0XWNI+4QbzoiR/nTkHAWNud2uvnJquD1/x5a7EQZMJT0AczqK0Qn67oY/TTQ1LbUKajZpp3I9tQ==", + "license": "MIT", + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/dotenv": { + "version": "17.4.2", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.4.2.tgz", + "integrity": "sha512-nI4U3TottKAcAD9LLud4Cb7b2QztQMUEfHbvhTH09bqXTxnSie8WnjPALV/WMCrJZ6UV/qHJ6L03OqO3LcdYZw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, + "node_modules/on-exit-leak-free": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/on-exit-leak-free/-/on-exit-leak-free-2.1.2.tgz", + "integrity": "sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA==", + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/pino": { + "version": "10.3.1", + "resolved": "https://registry.npmjs.org/pino/-/pino-10.3.1.tgz", + "integrity": "sha512-r34yH/GlQpKZbU1BvFFqOjhISRo1MNx1tWYsYvmj6KIRHSPMT2+yHOEb1SG6NMvRoHRF0a07kCOox/9yakl1vg==", + "license": "MIT", + "dependencies": { + "@pinojs/redact": "^0.4.0", + "atomic-sleep": "^1.0.0", + "on-exit-leak-free": "^2.1.0", + "pino-abstract-transport": "^3.0.0", + "pino-std-serializers": "^7.0.0", + "process-warning": "^5.0.0", + "quick-format-unescaped": "^4.0.3", + "real-require": "^0.2.0", + "safe-stable-stringify": "^2.3.1", + "sonic-boom": "^4.0.1", + "thread-stream": "^4.0.0" + }, + "bin": { + "pino": "bin.js" + } + }, + "node_modules/pino-abstract-transport": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pino-abstract-transport/-/pino-abstract-transport-3.0.0.tgz", + "integrity": "sha512-wlfUczU+n7Hy/Ha5j9a/gZNy7We5+cXp8YL+X+PG8S0KXxw7n/JXA3c46Y0zQznIJ83URJiwy7Lh56WLokNuxg==", + "license": "MIT", + "dependencies": { + "split2": "^4.0.0" + } + }, + "node_modules/pino-std-serializers": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/pino-std-serializers/-/pino-std-serializers-7.1.0.tgz", + "integrity": "sha512-BndPH67/JxGExRgiX1dX0w1FvZck5Wa4aal9198SrRhZjH3GxKQUKIBnYJTdj2HDN3UQAS06HlfcSbQj2OHmaw==", + "license": "MIT" + }, + "node_modules/process-warning": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/process-warning/-/process-warning-5.0.0.tgz", + "integrity": "sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fastify" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/fastify" + } + ], + "license": "MIT" + }, + "node_modules/quick-format-unescaped": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/quick-format-unescaped/-/quick-format-unescaped-4.0.4.tgz", + "integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==", + "license": "MIT" + }, + "node_modules/real-require": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/real-require/-/real-require-0.2.0.tgz", + "integrity": "sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg==", + "license": "MIT", + "engines": { + "node": ">= 12.13.0" + } + }, + "node_modules/safe-stable-stringify": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.5.0.tgz", + "integrity": "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA==", + "license": "MIT", + "engines": { + "node": ">=10" + } + }, + "node_modules/sonic-boom": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/sonic-boom/-/sonic-boom-4.2.1.tgz", + "integrity": "sha512-w6AxtubXa2wTXAUsZMMWERrsIRAdrK0Sc+FUytWvYAhBJLyuI4llrMIC1DtlNSdI99EI86KZum2MMq3EAZlF9Q==", + "license": "MIT", + "dependencies": { + "atomic-sleep": "^1.0.0" + } + }, + "node_modules/split2": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/split2/-/split2-4.2.0.tgz", + "integrity": "sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==", + "license": "ISC", + "engines": { + "node": ">= 10.x" + } + }, + "node_modules/thread-stream": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/thread-stream/-/thread-stream-4.2.0.tgz", + "integrity": "sha512-e2zZ96wSChazBsbENf/Pcm/4swHt2cEKQ92rhUjkL9GCKiTDJIaTBenjE/m9DXi0QBmTMDkFDdOomUy20A1tDQ==", + "license": "MIT", + "dependencies": { + "real-require": "^1.0.0" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/thread-stream/node_modules/real-require": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/real-require/-/real-require-1.0.0.tgz", + "integrity": "sha512-P4nbQYQfePJxRSmY+v/KINxVucm4NF3p3s7pJveMTtom52FR4YGltUQLB8idDXwDDWW+eYrWDFbuzUnjoWHF7g==", + "license": "MIT" + }, + "node_modules/undici": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.27.1.tgz", + "integrity": "sha512-UDdpiex+mzigiyrXrGbiUaF4HzTNhKbh2vRNFaTMzcqmLIPrZxaCtwo/1TMSuWoM1Xz3WiTo9KdgI3kRqYzJGg==", + "license": "MIT", + "engines": { + "node": ">=20.18.1" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..a7405a3 --- /dev/null +++ b/package.json @@ -0,0 +1,39 @@ +{ + "name": "polysearch", + "version": "2.0.0", + "description": "Multi-engine web + image search with proxy rotation, circuit breaker, and structured AI agent output", + "type": "module", + "main": "src/index.js", + "bin": { + "polysearch": "src/index.js" + }, + "scripts": { + "search": "node src/index.js", + "agent": "node src/index.js --mode agent", + "human": "node src/index.js --mode human", + "metrics": "node src/index.js -M", + "help": "node src/index.js --help" + }, + "keywords": [ + "polysearch", + "web-search", + "image-search", + "multi-engine", + "proxy", + "circuit-breaker", + "ai-agent", + "duckduckgo", + "oxylabs", + "webshare" + ], + "author": "", + "license": "MIT", + "engines": { + "node": ">=18.0.0" + }, + "dependencies": { + "dotenv": "^17.4.2", + "pino": "^10.3.1", + "undici": "^7.27.1" + } +} diff --git a/src/api-key.js b/src/api-key.js new file mode 100644 index 0000000..c9c6746 --- /dev/null +++ b/src/api-key.js @@ -0,0 +1,51 @@ +import { randomBytes } from "node:crypto"; +import { existsSync, readFileSync, writeFileSync } from "node:fs"; +import { resolve } from "node:path"; +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const PROJECT_ROOT = resolve(dirname(fileURLToPath(import.meta.url)), ".."); +const ENV_PATH = resolve(PROJECT_ROOT, ".env"); + +export function generateKey() { + return randomBytes(32).toString("hex"); +} + +export function saveKeyToEnv(key) { + if (!existsSync(ENV_PATH)) { + writeFileSync(ENV_PATH, `POLYSEARCH_API_KEY=${key}\n`); + return { path: ENV_PATH, created: true }; + } + + const raw = readFileSync(ENV_PATH, "utf-8"); + const lines = raw.split("\n"); + let replaced = false; + + const updated = lines.map(line => { + if (line.startsWith("POLYSEARCH_API_KEY=")) { + replaced = true; + return `POLYSEARCH_API_KEY=${key}`; + } + return line; + }); + + if (!replaced) { + updated.push(`POLYSEARCH_API_KEY=${key}`); + } + + writeFileSync(ENV_PATH, updated.join("\n")); + return { path: ENV_PATH, created: !replaced }; +} + +export function loadApiKey() { + return process.env.POLYSEARCH_API_KEY || null; +} + +export function requireApiKey() { + const key = loadApiKey(); + if (!key) { + console.error("No API key found. Generate one with: node src/index.js --generate-key"); + process.exit(1); + } + return key; +} diff --git a/src/api.js b/src/api.js new file mode 100644 index 0000000..bc03f6c --- /dev/null +++ b/src/api.js @@ -0,0 +1,242 @@ +import { createServer } from "node:http"; +import { loadConfig } from "./config.js"; +import { HttpClient } from "./http/client.js"; +import { ProxyPool } from "./http/proxy.js"; +import { SearchRunner } from "./run.js"; +import { setUserAgents } from "./utils/ua.js"; +import { loadApiKey } from "./api-key.js"; +import { logger, childLogger } from "./utils/logger.js"; +import { formatSearchResponse, formatErrorResponse } from "./output/agent.js"; + +const log = childLogger({ component: "api-server" }); + +function unauthorized(res, msg = "Unauthorized") { + res.writeHead(401, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ success: false, error: { code: "UNAUTHORIZED", message: msg } })); +} + +function badRequest(res, msg) { + res.writeHead(400, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ success: false, error: { code: "BAD_REQUEST", message: msg } })); +} + +function serverError(res, msg) { + if (res.headersSent) return; + res.writeHead(500, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ success: false, error: { code: "INTERNAL_ERROR", message: msg } })); +} + +function parseBody(req) { + return new Promise((resolve, reject) => { + let data = ""; + req.on("data", chunk => data += chunk); + req.on("end", () => { + try { resolve(JSON.parse(data)); } + catch { reject(new Error("Invalid JSON")); } + }); + req.on("error", reject); + }); +} + +function authenticate(req) { + const apiKey = loadApiKey(); + if (!apiKey) return true; + + const auth = req.headers["authorization"] || ""; + const token = auth.startsWith("Bearer ") ? auth.slice(7) : ""; + return token === apiKey; +} + +export async function startServer(port = 9876) { + const config = await loadConfig(); + if (config.http.user_agents) setUserAgents(config.http.user_agents); + + const httpClient = new HttpClient(config.http); + const proxyPool = new ProxyPool(config.proxies, config.proxy); + + if (config.proxy.enabled) { + httpClient.setProxyPool(proxyPool); + log.info({ proxyCount: config.proxies.length }, "proxy pool attached to API server"); + } + + const runner = new SearchRunner({ httpClient, config }); + + const server = createServer(async (req, res) => { + res.setHeader("Access-Control-Allow-Origin", "*"); + res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS"); + res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization"); + + if (req.method === "OPTIONS") { + res.writeHead(204); + res.end(); + return; + } + + try { + const url = new URL(req.url, `http://${req.headers.host || "localhost"}`); + const path = url.pathname; + + // Health check — no auth required + if (path === "/health" && req.method === "GET") { + res.writeHead(200, { "Content-Type": "application/json" }); + const m = proxyPool.getMetrics(); + res.end(JSON.stringify({ + success: true, + status: "ok", + uptime: process.uptime(), + proxyCount: config.proxies.length, + proxyPool: { + total: m.totalProxies, + alive: m.alive, + dead: m.dead, + circuitOpen: m.circuitOpen, + requestsTotal: m.requestsTotal, + successRate: m.successRate, + hourlyUsage: m.hourlyUsageCurrent + } + })); + return; + } + + // Metrics — requires auth + if (path === "/metrics" && req.method === "GET") { + if (!authenticate(req)) return unauthorized(res); + res.writeHead(200, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ + success: true, + metrics: proxyPool.getMetrics(), + proxies: proxyPool.getProxyDetail() + })); + return; + } + + // Auth required for search endpoints + if (!authenticate(req)) return unauthorized(res); + + if (path === "/search" && req.method === "POST") { + const body = await parseBody(req); + const { query, type = "image", limit = 10 } = body; + if (!query) return badRequest(res, "Missing 'query' field"); + + const start = Date.now(); + const data = await runner.run({ query, type, limit }); + const response = JSON.parse(formatSearchResponse(data)); + response.execution_time_ms = Date.now() - start; + + res.writeHead(data.image?.results?.length > 0 || data.web?.results?.length > 0 ? 200 : 404, { + "Content-Type": "application/json" + }); + res.end(JSON.stringify(response)); + return; + } + + if (path === "/batch" && req.method === "POST") { + const body = await parseBody(req); + const { queries = [] } = body; + + if (!Array.isArray(queries) || queries.length === 0) { + return badRequest(res, "Missing or empty 'queries' array"); + } + if (queries.length > 50) { + return badRequest(res, "Maximum 50 queries per batch"); + } + + const logBatch = childLogger({ component: "batch", batchSize: queries.length }); + + const batchStart = Date.now(); + logBatch.info("batch started"); + + const batchResults = await Promise.allSettled( + queries.map(async (q, i) => { + const qStart = Date.now(); + try { + const data = await runner.run({ + query: q.query || "", + type: q.type || "image", + limit: q.limit || 10 + }); + return { + index: i, + query: q.query, + type: q.type || "image", + success: true, + execution_time_ms: Date.now() - qStart, + results: data, + errors: data.errors || [] + }; + } catch (err) { + return { + index: i, + query: q.query, + type: q.type || "image", + success: false, + execution_time_ms: Date.now() - qStart, + error: err.message + }; + } + }) + ); + + const totalMs = Date.now() - batchStart; + const ok = batchResults.filter(r => r.status === "fulfilled" && r.value.success).length; + const fail = batchResults.filter(r => r.status === "fulfilled" && !r.value.success).length; + + logBatch.info({ total: queries.length, ok, fail, totalMs }, "batch completed"); + + res.writeHead(200, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ + success: true, + batch_size: queries.length, + execution_time_ms: totalMs, + ok, + fail, + results: batchResults.map(r => r.status === "fulfilled" ? r.value : { + index: -1, + query: "unknown", + success: false, + error: r.reason?.message || "Promise rejected" + }), + metrics: { + pool: proxyPool.getMetrics(), + hourly_usage: proxyPool.getMetrics().hourlyUsageCurrent + } + })); + return; + } + + // 404 + res.writeHead(404, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ success: false, error: { code: "NOT_FOUND", message: `No endpoint: ${req.method} ${path}` } })); + + } catch (err) { + log.error({ error: err.message }, "API error"); + serverError(res, err.message); + } + }); + + server.listen(port, () => { + log.info({ port }, "API server started"); + console.log(`\n PolySearch API running on http://localhost:${port}`); + console.log(` Health: http://localhost:${port}/health`); + console.log(` Search: POST http://localhost:${port}/search`); + console.log(` Batch: POST http://localhost:${port}/batch`); + console.log(` Metrics: GET http://localhost:${port}/metrics`); + const key = loadApiKey(); + if (key) { + console.log(` Auth: Authorization: Bearer `); + console.log(` Key: ${key.substring(0, 8)}...${key.slice(-4)}\n`); + } else { + console.log(` Auth: none (no API key configured)\n`); + } + }); + + const shutdown = () => { + log.info("shutting down API server"); + proxyPool.destroy(); + server.close(() => process.exit(0)); + }; + process.on("SIGINT", shutdown); + process.on("SIGTERM", shutdown); + + return server; +} diff --git a/src/cli.js b/src/cli.js new file mode 100644 index 0000000..2ca1137 --- /dev/null +++ b/src/cli.js @@ -0,0 +1,50 @@ +import { parseArgs } from "node:util"; + +export function parseCliArgs() { + const { values, positionals } = parseArgs({ + args: process.argv.slice(2), + options: { + query: { type: "string", short: "q" }, + type: { type: "string", short: "t", default: "image" }, + limit: { type: "string", short: "l", default: "10" }, + mode: { type: "string", short: "m", default: "human" }, + config: { type: "string", short: "c" }, + proxy: { type: "string", short: "p" }, + metrics: { type: "boolean", short: "M", default: false }, + "generate-key": { type: "boolean", default: false }, + serve: { type: "boolean", default: false }, + port: { type: "string", default: "9876" }, + help: { type: "boolean", short: "h", default: false } + }, + strict: false, + allowPositionals: true + }); + + const type = (values.type || "image").toLowerCase(); + const mode = (values.mode || "human").toLowerCase(); + + if (!["web", "image", "both"].includes(type)) { + console.error(`Invalid type "${values.type}". Must be: web, image, or both`); + process.exit(1); + } + + if (!["human", "agent"].includes(mode)) { + console.error(`Invalid mode "${values.mode}". Must be: human or agent`); + process.exit(1); + } + + return { + query: values.query || null, + type, + limit: Math.min(parseInt(values.limit, 10) || 10, 50), + mode, + configPath: values.config || null, + proxy: values.proxy || null, + metrics: values.metrics, + generateKey: values["generate-key"], + serve: values.serve, + port: parseInt(values.port, 10) || 9876, + help: values.help, + positionals + }; +} diff --git a/src/config.js b/src/config.js new file mode 100644 index 0000000..1561b43 --- /dev/null +++ b/src/config.js @@ -0,0 +1,134 @@ +import dotenv from "dotenv"; +import { existsSync, readFileSync } from "node:fs"; +import { resolve, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; +import { homedir } from "node:os"; +import { childLogger } from "./utils/logger.js"; +import { fetchAllProxies, listProviders } from "./http/providers/index.js"; +import "./http/providers/webshare.js"; +import "./http/providers/oxylabs.js"; + +const PROJECT_ROOT = resolve(dirname(fileURLToPath(import.meta.url)), ".."); + +dotenv.config({ path: resolve(PROJECT_ROOT, ".env") }); + +const DEFAULTS = { + engines: { + duckduckgo: { priority: 1, enabled: true, timeout_ms: 10000, retries: 2 } + }, + proxies: [], + proxy: { + enabled: false, + rotation: "round-robin", + health_check_interval_ms: 60000, + max_failures: 3 + }, + http: { + timeout_ms: 10000, + retry_max_attempts: 3, + retry_base_delay_ms: 1000, + retry_max_delay_ms: 10000, + user_agents: [] + }, + search: { + default_type: "image", + default_limit: 10, + max_limit: 50 + } +}; + +function mergeDeep(target, source) { + const result = { ...target }; + for (const key of Object.keys(source)) { + if (source[key] && typeof source[key] === "object" && !Array.isArray(source[key])) { + result[key] = mergeDeep(target[key] || {}, source[key]); + } else { + result[key] = source[key]; + } + } + return result; +} + +function findConfig() { + const paths = [ + process.env.POLYSEARCH_CONFIG || process.env.IMAGE_SEARCH_CONFIG, + resolve(PROJECT_ROOT, "config.json"), + resolve(PROJECT_ROOT, "config.example.json"), + resolve(homedir(), ".polysearch.json"), + resolve(homedir(), ".image-search.json") + ]; + + for (const p of paths) { + if (p && existsSync(p)) return p; + } + return null; +} + +function loadProxyFile() { + const proxyFile = resolve(PROJECT_ROOT, "proxies.txt"); + if (!existsSync(proxyFile)) return []; + + const raw = readFileSync(proxyFile, "utf-8"); + const proxies = []; + + for (const line of raw.split("\n")) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith("#")) continue; + + const parts = trimmed.split(":"); + if (parts.length === 4) { + const [host, port, user, pass] = parts; + proxies.push(`http://${encodeURIComponent(user)}:${encodeURIComponent(pass)}@${host}:${port}`); + } else if (parts.length === 2) { + proxies.push(`http://${parts[0]}:${parts[1]}`); + } + } + + return proxies; +} + +export async function loadConfig(configPath) { + const finalPath = configPath || findConfig(); + let config = { ...DEFAULTS }; + + if (finalPath && existsSync(finalPath)) { + try { + const raw = readFileSync(finalPath, "utf-8"); + const parsed = JSON.parse(raw); + config = mergeDeep(config, parsed); + } catch (err) { + console.error(`Warning: Failed to load config from ${finalPath}: ${err.message}`); + } + } + + const log = childLogger({ component: "config" }); + + const [providerProxies, fileProxies] = await Promise.all([ + fetchAllProxies().catch(err => { + log.warn({ error: err.message }, "all proxy providers failed"); + return []; + }), + Promise.resolve(loadProxyFile()) + ]); + + const allProxies = [...new Set([...providerProxies, ...fileProxies, ...config.proxies])]; + + if (allProxies.length > 0) { + config.proxies = allProxies; + if (!config.proxy.enabled) { + config.proxy.enabled = true; + } + } + + config.http.user_agents = config.http.user_agents.length > 0 + ? config.http.user_agents + : null; + + log.info({ + providers: listProviders().map(p => p.name), + proxyCount: allProxies.length, + proxyEnabled: config.proxy.enabled + }, "config loaded"); + + return config; +} diff --git a/src/engines/base.js b/src/engines/base.js new file mode 100644 index 0000000..bbf8c5f --- /dev/null +++ b/src/engines/base.js @@ -0,0 +1,25 @@ +export class SearchEngine { + static name = "base"; + static supports = []; + static priority = 10; + + constructor(httpClient) { + this.http = httpClient; + } + + async search(query, options = {}) { + throw new Error("search() must be implemented by subclass"); + } + + get supportedTypes() { + return this.constructor.supports; + } + + get engineName() { + return this.constructor.name; + } + + get priority() { + return this.constructor.priority; + } +} diff --git a/src/engines/duckduckgo.js b/src/engines/duckduckgo.js new file mode 100644 index 0000000..64aab74 --- /dev/null +++ b/src/engines/duckduckgo.js @@ -0,0 +1,177 @@ +import { SearchEngine } from "./base.js"; + +export class DuckDuckGo extends SearchEngine { + static name = "duckduckgo"; + static supports = ["web", "image"]; + static priority = 1; + + async search(query, options = {}) { + const { type = "image", limit = 10, proxy = null, signal = null } = options; + + if (type === "web") { + return this._webSearch(query, limit, proxy, signal); + } + return this._imageSearch(query, limit, proxy, signal); + } + + async _webSearch(query, limit, proxy, signal) { + const errors = []; + + for (const strategy of [this._webViaLite.bind(this), this._webViaHtml.bind(this)]) { + try { + const result = await strategy(query, limit, proxy, signal); + if (result && result.results && result.results.length > 0) { + this.http.log.debug({ strategy: strategy.name, count: result.results.length }, "web search strategy succeeded"); + return result; + } + } catch (err) { + const label = strategy.name || "unknown"; + this.http.log.warn({ strategy: label, error: err.message }, "web search strategy failed"); + errors.push({ strategy: label, error: err.message }); + } + } + + this.http.log.warn({ strategies: errors.length, query }, "all web search strategies exhausted"); + + if (errors.some(e => e.error.includes("RATE_LIMITED"))) { + throw new Error(`RATE_LIMITED: DuckDuckGo rejected all strategies. ${errors.map(e => e.error).join("; ")}`); + } + + return { results: [], engine: "duckduckgo", type: "web", total: 0 }; + } + + async _webViaLite(query, limit, proxy, signal) { + const html = await this.http.fetch( + `https://lite.duckduckgo.com/lite/?q=${encodeURIComponent(query)}`, + { responseType: "text", proxy, signal, retries: 1, timeoutMs: 8000 } + ); + + const results = []; + const linkRegex = /]*rel="nofollow"[^>]*href="([^"]*)"[^>]*class='result-link'[^>]*>([\s\S]*?)<\/a>/gi; + const snippetRegex = /([\s\S]*?)<\/td>/gi; + const linkTextRegex = /([\s\S]*?)<\/span>/gi; + + const links = [...html.matchAll(linkRegex)]; + const snippets = [...html.matchAll(snippetRegex)]; + const urls = [...html.matchAll(linkTextRegex)]; + + for (let i = 0; i < links.length && results.length < limit; i++) { + let rawUrl = decodeEntities(links[i][1].trim()); + const title = decodeEntities(stripTags(links[i][2]).trim()); + const snippet = snippets[i] ? decodeEntities(stripTags(snippets[i][1]).trim()) : ""; + const displayUrl = urls[i] ? decodeEntities(urls[i][1].trim()) : ""; + + const uddgMatch = rawUrl.match(/uddg=([^&]+)/); + let url = uddgMatch ? decodeURIComponent(uddgMatch[1]) : rawUrl; + + if (url.startsWith("//")) url = "https:" + url; + + let domain = "unknown"; + try { domain = new URL(url).hostname; } catch {} + if (domain === "unknown" && displayUrl) { + try { domain = new URL("https://" + displayUrl).hostname; } catch {} + } + + results.push({ title, url, domain, snippet, engine: "duckduckgo" }); + } + + return { results, engine: "duckduckgo", type: "web", total: results.length }; + } + + async _webViaHtml(query, limit, proxy, signal) { + const html = await this.http.fetch( + `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`, + { responseType: "text", proxy, signal, retries: 2, timeoutMs: 10000 } + ); + + const results = []; + const resultBlocks = html.split('
]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/i); + if (!titleMatch) continue; + + const snippetMatch = block.match(/]*class="[^"]*result__snippet[^"]*"[^>]*>([\s\S]*?)<\/a>/i); + + let rawUrl = decodeEntities(titleMatch[1].trim()); + const title = decodeEntities(stripTags(titleMatch[2]).trim()); + const snippet = snippetMatch ? decodeEntities(stripTags(snippetMatch[1]).trim()) : ""; + + const uddgMatch = rawUrl.match(/uddg=([^&]+)/); + let url = uddgMatch ? decodeURIComponent(uddgMatch[1]) : rawUrl; + if (url.startsWith("//")) url = "https:" + url; + + let domain = "unknown"; + try { domain = new URL(url).hostname; } catch {} + + results.push({ title, url, domain, snippet, engine: "duckduckgo" }); + } + + return { results, engine: "duckduckgo", type: "web", total: results.length }; + } + + async _imageSearch(query, limit, proxy, signal) { + const initHtml = await this.http.fetch( + `https://duckduckgo.com/?q=${encodeURIComponent(query)}&iax=images&ia=images`, + { responseType: "text", proxy, signal } + ); + + const vqdMatch = initHtml.match(/vqd\s*=\s*['"]([^'"]+)['"]/); + if (!vqdMatch) { + throw new Error("Could not extract token (vqd) from DuckDuckGo"); + } + + const data = await this.http.fetch( + `https://duckduckgo.com/i.js?q=${encodeURIComponent(query)}&o=json&vqd=${vqdMatch[1]}&f=,,,`, + { proxy, signal } + ); + + const rawResults = data.results || []; + const results = rawResults.slice(0, limit).map((item) => { + let domain = "unknown"; + try { domain = new URL(item.image).hostname; } + catch { + if (item.url) { + try { domain = new URL(item.url).hostname; } catch {} + } + } + + return { + title: item.title || "", + image_url: item.image || "", + source_url: item.url || "", + domain, + width: item.width || null, + height: item.height || null, + thumbnail: item.thumbnail || null, + engine: "duckduckgo" + }; + }); + + return { + results, + engine: "duckduckgo", + type: "image", + total: results.length + }; + } +} + +function decodeEntities(str) { + return str + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/'/g, "'") + .replace(///g, "/") + .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(code)); +} + +function stripTags(str) { + return str.replace(/<[^>]*>/g, "").trim(); +} diff --git a/src/engines/index.js b/src/engines/index.js new file mode 100644 index 0000000..77d4515 --- /dev/null +++ b/src/engines/index.js @@ -0,0 +1,24 @@ +const registry = []; + +export function registerEngine(EngineClass) { + registry.push(EngineClass); +} + +export function getEngines(type, httpClient) { + return registry + .filter(Engine => Engine.supports.includes(type)) + .sort((a, b) => a.priority - b.priority) + .map(Engine => new Engine(httpClient)); +} + +export function getEngineNames() { + return registry.map(Engine => ({ + name: Engine.name, + supports: Engine.supports, + priority: Engine.priority + })); +} + +export function clearEngines() { + registry.length = 0; +} diff --git a/src/engines/setup.js b/src/engines/setup.js new file mode 100644 index 0000000..e597944 --- /dev/null +++ b/src/engines/setup.js @@ -0,0 +1,17 @@ +import { DuckDuckGo } from "./duckduckgo.js"; +import { registerEngine, getEngines, getEngineNames, clearEngines } from "./index.js"; + +registerEngine(DuckDuckGo); + +export function createEngines(httpClient) { + return { + web: () => getEngines("web", httpClient), + image: () => getEngines("image", httpClient), + all: () => ({ + web: getEngines("web", httpClient), + image: getEngines("image", httpClient) + }) + }; +} + +export { getEngineNames, clearEngines, registerEngine }; diff --git a/src/http/client.js b/src/http/client.js new file mode 100644 index 0000000..a623c45 --- /dev/null +++ b/src/http/client.js @@ -0,0 +1,189 @@ +import { ProxyAgent } from "undici"; +import { getNextUA } from "../utils/ua.js"; +import { withRetry, isRetryable } from "../utils/retry.js"; +import { childLogger } from "../utils/logger.js"; + +const proxyAgentCache = new Map(); + +function getOrCreateProxyAgent(proxyUrl) { + if (!proxyUrl) return undefined; + if (!proxyAgentCache.has(proxyUrl)) { + proxyAgentCache.set(proxyUrl, new ProxyAgent(proxyUrl)); + } + return proxyAgentCache.get(proxyUrl); +} + +export class HttpClient { + constructor(config = {}) { + this.log = childLogger({ component: "http-client" }); + this.timeoutMs = config.timeout_ms || 10000; + this.retryAttempts = config.retry_max_attempts || 3; + this.retryBaseDelay = config.retry_base_delay_ms || 1000; + this.retryMaxDelay = config.retry_max_delay_ms || 10000; + this.userAgents = config.user_agents || null; + this.proxyPool = null; + } + + setProxyPool(pool) { + this.proxyPool = pool; + this.log.info({ proxyCount: pool.proxies?.length }, "proxy pool attached to HTTP client"); + } + + async fetch(url, options = {}) { + const { + method = "GET", + headers = {}, + body = null, + responseType = "json", + timeoutMs = this.timeoutMs, + retries = this.retryAttempts, + proxy = null, + signal = null, + engine = null + } = options; + + const requestLog = this.log.child({ + url: url.length > 100 ? url.substring(0, 100) + "..." : url, + method, + engine + }); + + requestLog.debug({ timeoutMs, retries }, "request starting"); + + return withRetry( + async (attempt) => { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + + const combinedSignal = signal + ? combineSignals(signal, controller.signal) + : controller.signal; + + const ua = this.userAgents + ? this.userAgents[Math.floor(Math.random() * this.userAgents.length)] + : getNextUA(); + + const fetchHeaders = { + "User-Agent": ua, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + ...headers + }; + + let proxyUrl = proxy ? proxy.url : null; + let usedProxy = proxy; + + if (!proxy && this.proxyPool) { + const p = this.proxyPool.select(); + if (p) { + proxyUrl = p.url; + usedProxy = p; + } + } + + requestLog.debug({ attempt: attempt + 1, proxy: usedProxy?.masked || "none" }, "request attempt"); + + const requestStart = Date.now(); + + try { + const fetchOpts = { + method, + headers: fetchHeaders, + signal: combinedSignal, + ...(body ? { body } : {}) + }; + + if (proxyUrl) { + const agent = getOrCreateProxyAgent(proxyUrl); + fetchOpts.dispatcher = agent; + } + + const response = await fetch(url, fetchOpts); + + if (!response.ok) { + const err = new Error(`HTTP ${response.status}: ${response.statusText}`); + err.status = response.status; + requestLog.warn({ status: response.status }, "request returned non-ok status"); + throw err; + } + + if (response.status === 202) { + const bodyHint = (await response.text().catch(() => "")).substring(0, 120); + const err = new Error(`HTTP 202 (Accepted): server deferred or blocked request. Body: "${bodyHint}"`); + err.status = 202; + requestLog.warn({ status: 202, bodyHint }, "request returned HTTP 202 (blocked/deferred)"); + throw err; + } + + const textOrBuffer = await response.clone().text(); + const bytes = textOrBuffer.length; + const latencyMs = Date.now() - requestStart; + + if (this.proxyPool) this.proxyPool.markSuccess(usedProxy, latencyMs, bytes); + requestLog.debug({ status: response.status, bytes, latencyMs }, "request succeeded"); + + if (responseType === "json") { + const text = await response.text(); + try { + return JSON.parse(text); + } catch { + return text; + } + } + if (responseType === "text") return await response.text(); + return response; + } catch (err) { + const latencyMs = Date.now() - requestStart; + if (this.proxyPool) this.proxyPool.markFailed(usedProxy, err, latencyMs); + + if (err.name === "AbortError") { + requestLog.warn({ timeoutMs }, "request timed out"); + const timeoutErr = new Error(`Request timed out after ${timeoutMs}ms`); + timeoutErr.status = 0; + throw timeoutErr; + } + + if (!isRetryable(err) || attempt >= retries - 1) { + requestLog.error({ error: err.message, status: err.status, latencyMs }, "request failed, not retrying"); + throw err; + } + + requestLog.warn({ error: err.message, attempt: attempt + 1, latencyMs }, "request failed, will retry"); + throw err; + } finally { + clearTimeout(timeout); + } + }, + { + attempts: retries, + baseDelayMs: this.retryBaseDelay, + maxDelayMs: this.retryMaxDelay, + onRetry: ({ attempt, error, delayMs }) => { + requestLog.warn({ attempt, error: error.message, delayMs }, "retrying request"); + } + } + ); + } +} + +function combineSignals(...signals) { + const controller = new AbortController(); + + const onAbort = () => controller.abort(); + for (const signal of signals) { + if (signal.aborted) { + controller.abort(); + return controller.signal; + } + signal.addEventListener("abort", onAbort, { once: true }); + } + + const original = controller.signal; + original._cleanup = () => { + for (const signal of signals) { + signal.removeEventListener("abort", onAbort); + } + }; + + return original; +} diff --git a/src/http/providers/index.js b/src/http/providers/index.js new file mode 100644 index 0000000..41348d7 --- /dev/null +++ b/src/http/providers/index.js @@ -0,0 +1,39 @@ +import { childLogger } from "../../utils/logger.js"; + +const log = childLogger({ component: "proxy-provider" }); +const registry = []; + +export function registerProvider(name, fetcher, description = "") { + registry.push({ name, fetcher, description }); + log.info({ name, description }, "proxy provider registered"); +} + +export async function fetchAllProxies() { + const all = []; + + for (const { name, fetcher } of registry) { + try { + const proxies = await fetcher(); + if (proxies && proxies.length > 0) { + log.info({ provider: name, count: proxies.length }, "proxies fetched"); + for (const p of proxies) { + all.push(typeof p === "string" ? { url: p, provider: name } : { provider: name, ...p }); + } + } else { + log.debug({ provider: name }, "provider returned 0 proxies"); + } + } catch (err) { + log.warn({ provider: name, error: err.message }, "provider failed"); + } + } + + return all; +} + +export function listProviders() { + return registry.map(r => ({ name: r.name, description: r.description })); +} + +export function getProviderCount() { + return registry.length; +} diff --git a/src/http/providers/oxylabs.js b/src/http/providers/oxylabs.js new file mode 100644 index 0000000..5298d7a --- /dev/null +++ b/src/http/providers/oxylabs.js @@ -0,0 +1,26 @@ +import { registerProvider } from "./index.js"; + +const ENTRY_POINT = "dc.oxylabs.io"; +const PORT = "8000"; + +async function fetchOxylabsProxies() { + const username = process.env.OXYLABS_USERNAME; + const password = process.env.OXYLABS_PASSWORD; + const country = process.env.OXYLABS_COUNTRY || "US"; + const stickySession = process.env.OXYLABS_STICKY_SESSION || ""; + + if (!username || !password) return []; + + const sessParam = stickySession ? `-sessid-${stickySession}` : ""; + const entry = `${ENTRY_POINT}:${PORT}`; + const userPart = `user-${username}-country-${country}${sessParam}`; + const proxyUrl = `http://${encodeURIComponent(userPart)}:${encodeURIComponent(password)}@${entry}`; + + return [proxyUrl]; +} + +registerProvider( + "oxylabs", + fetchOxylabsProxies, + "Oxylabs datacenter proxies (dc.oxylabs.io:8000)" +); diff --git a/src/http/providers/webshare.js b/src/http/providers/webshare.js new file mode 100644 index 0000000..a088d11 --- /dev/null +++ b/src/http/providers/webshare.js @@ -0,0 +1,45 @@ +import { registerProvider, fetchAllProxies } from "./index.js"; + +const API_BASE = "https://proxy.webshare.io/api/v2/proxy/list/"; +const CACHE_TTL_MS = 5 * 60 * 1000; +let cached = null; +let lastFetch = 0; + +async function fetchWebshareProxies() { + const apiKey = process.env.WEBSHARE_API_KEY; + if (!apiKey) return []; + + if (cached && Date.now() - lastFetch < CACHE_TTL_MS) return cached; + + const allProxies = []; + let page = 1; + let hasMore = true; + + while (hasMore) { + const url = `${API_BASE}?mode=direct&page=${page}&page_size=100`; + const response = await fetch(url, { + headers: { Authorization: `Token ${apiKey}` } + }); + + if (!response.ok) { + throw new Error(`Webshare API HTTP ${response.status}`); + } + + const data = await response.json(); + + for (const p of data.results) { + if (p.valid) { + allProxies.push(`http://${p.username}:${p.password}@${p.proxy_address}:${p.port}`); + } + } + + hasMore = !!data.next; + page++; + } + + cached = allProxies; + lastFetch = Date.now(); + return allProxies; +} + +registerProvider("webshare", fetchWebshareProxies, "proxy.webshare.io API"); diff --git a/src/http/proxy.js b/src/http/proxy.js new file mode 100644 index 0000000..c6b9123 --- /dev/null +++ b/src/http/proxy.js @@ -0,0 +1,304 @@ +import { isRetryable } from "../utils/retry.js"; +import { childLogger } from "../utils/logger.js"; + +const HOUR_MS = 3600_000; + +export class ProxyPool { + constructor(proxies = [], options = {}) { + this.log = childLogger({ component: "proxy-pool" }); + this.enabled = options.enabled !== false && proxies.length > 0; + this.maxFailures = options.max_failures || 3; + this.circuitBaseMs = options.circuit_base_ms || 5000; + this.circuitMaxMs = options.circuit_max_ms || 300_000; + + const entries = []; + const seen = new Set(); + + for (const p of proxies) { + const url = typeof p === "string" ? p : p.url; + if (seen.has(url)) continue; + seen.add(url); + + const provider = typeof p === "string" ? "manual" : (p.provider || "manual"); + const hasAuth = url.includes("@"); + const masked = url.replace(/\/\/([^:]+):([^@]+)@/, "//$1:***@"); + + entries.push({ + url, + masked, + provider, + authenticated: hasAuth, + alive: true, + failures: 0, + consecutiveFailures: 0, + lastUsed: 0, + lastCheck: 0, + + circuitState: "closed", + circuitFailures: 0, + circuitOpenUntil: 0, + + hourlyUsage: {}, + totalRequests: 0, + totalSuccesses: 0, + totalFailures: 0, + totalTimeouts: 0, + totalRateLimited: 0, + totalBytes: 0, + totalLatencyMs: 0, + lastLatencyMs: 0 + }); + } + + this.proxies = [ + ...entries.filter(p => p.authenticated), + ...entries.filter(p => !p.authenticated) + ]; + + this.index = 0; + + if (this.enabled) { + const byProvider = {}; + for (const p of this.proxies) { + byProvider[p.provider] = (byProvider[p.provider] || 0) + 1; + } + this.log.info({ + count: this.proxies.length, + authenticated: this.proxies.filter(p => p.authenticated).length, + providers: byProvider + }, "proxy pool initialized"); + } + + this._metricsInterval = setInterval(() => { + this.log.info(this.getMetrics(), "proxy pool metrics periodic"); + }, 300_000).unref(); + } + + select() { + if (!this.enabled || this.proxies.length === 0) return null; + + const now = Date.now(); + const candidates = this.proxies.filter(p => { + if (!p.alive) return false; + if (p.circuitState === "open" && now < p.circuitOpenUntil) return false; + if (p.circuitState === "open" && now >= p.circuitOpenUntil) { + p.circuitState = "half-open"; + this.log.info({ proxy: p.masked }, "circuit half-open, allowing probe"); + } + return true; + }); + + if (candidates.length === 0) { + this.log.warn({ total: this.proxies.length }, "no candidates available, resetting dead proxies"); + for (const p of this.proxies) { + if (!p.alive) { + p.alive = true; + p.consecutiveFailures = 0; + } + } + const p = this.proxies[0]; + if (p) { + p.circuitState = "closed"; + p.lastUsed = now; + return p; + } + return null; + } + + const hourKey = this._hourKey(now); + candidates.sort((a, b) => { + const aUsage = a.hourlyUsage[hourKey] || 0; + const bUsage = b.hourlyUsage[hourKey] || 0; + if (aUsage !== bUsage) return aUsage - bUsage; + + if (a.consecutiveFailures !== b.consecutiveFailures) { + return a.consecutiveFailures - b.consecutiveFailures; + } + + const aCircuit = a.circuitFailures; + const bCircuit = b.circuitFailures; + return aCircuit - bCircuit; + }); + + const proxy = candidates[0]; + proxy.lastUsed = now; + proxy.hourlyUsage[hourKey] = (proxy.hourlyUsage[hourKey] || 0) + 1; + proxy.totalRequests++; + + return proxy; + } + + markSuccess(proxy, latencyMs = 0, bytes = 0) { + if (!proxy) return; + proxy.failures = 0; + proxy.consecutiveFailures = 0; + proxy.totalSuccesses++; + proxy.totalLatencyMs += latencyMs; + proxy.lastLatencyMs = latencyMs; + proxy.totalBytes += bytes; + proxy.lastCheck = Date.now(); + + if (proxy.circuitState === "half-open") { + proxy.circuitState = "closed"; + proxy.circuitFailures = 0; + this.log.info({ proxy: proxy.masked }, "circuit closed after successful probe"); + } + + proxy.alive = true; + } + + markFailed(proxy, error = null, latencyMs = 0) { + if (!proxy) return; + proxy.consecutiveFailures++; + proxy.failures++; + proxy.totalFailures++; + proxy.totalLatencyMs += latencyMs; + proxy.lastLatencyMs = latencyMs; + proxy.lastCheck = Date.now(); + + const is202 = error && (error.status === 202 || (error.message && error.message.includes("202"))); + const isTimeout = error && (error.message && (error.message.includes("timeout") || error.message.includes("timed out") || error.name === "AbortError")); + const isNonRetryable = error && !isRetryable(error); + + if (is202) { + proxy.totalRateLimited++; + this.log.warn({ proxy: proxy.masked, failures: proxy.failures }, "proxy rate-limited (202)"); + if (proxy.failures >= this.maxFailures) { + this._tripCircuit(proxy, "rate-limited"); + } + return; + } + + if (isTimeout) { + proxy.totalTimeouts++; + } + + if (isNonRetryable) { + this._tripCircuit(proxy, error.message); + return; + } + + if (proxy.failures >= this.maxFailures) { + proxy.alive = false; + this.log.warn({ + proxy: proxy.masked, failures: proxy.failures, + error: error?.message + }, "proxy marked dead"); + } else { + this.log.warn({ + proxy: proxy.masked, failures: proxy.failures, + error: error?.message + }, "proxy request failed"); + } + } + + _tripCircuit(proxy, reason) { + proxy.circuitFailures++; + const delay = Math.min( + this.circuitBaseMs * Math.pow(2, proxy.circuitFailures - 1), + this.circuitMaxMs + ); + proxy.circuitState = "open"; + proxy.circuitOpenUntil = Date.now() + delay; + + this.log.warn({ + proxy: proxy.masked, + circuitFailures: proxy.circuitFailures, + circuitOpenMs: delay, + reason + }, "circuit tripped"); + } + + getMetrics() { + const byProvider = {}; + let totalReq = 0, totalOk = 0, totalFail = 0; + + for (const p of this.proxies) { + const pr = byProvider[p.provider] || { + proxyCount: 0, alive: 0, dead: 0, + circuitOpen: 0, requests: 0, success: 0, failure: 0, + rateLimited: 0, timeout: 0, avgLatencyMs: 0, totalLatencyMs: 0, + hourlyUsage: {} + }; + pr.proxyCount++; + if (p.alive) pr.alive++; else pr.dead++; + if (p.circuitState === "open") pr.circuitOpen++; + pr.requests += p.totalRequests; + pr.success += p.totalSuccesses; + pr.failure += p.totalFailures; + pr.rateLimited += p.totalRateLimited; + pr.timeout += p.totalTimeouts; + pr.totalLatencyMs += p.totalLatencyMs; + + totalReq += p.totalRequests; + totalOk += p.totalSuccesses; + totalFail += p.totalFailures; + + const hk = this._hourKey(Date.now()); + pr.hourlyUsage[hk] = (pr.hourlyUsage[hk] || 0) + (p.hourlyUsage[hk] || 0); + + byProvider[p.provider] = pr; + } + + for (const [name, pr] of Object.entries(byProvider)) { + pr.avgLatencyMs = pr.success > 0 ? Math.round(pr.totalLatencyMs / pr.success) : 0; + delete pr.totalLatencyMs; + const hk = this._hourKey(Date.now()); + const hourly = pr.hourlyUsage[hk] || 0; + pr.hourlyUsage = { currentHour: hourly }; + pr.successRate = pr.requests > 0 ? (pr.success / pr.requests * 100).toFixed(1) + "%" : "0%"; + } + + const allHourly = this.proxies.reduce((s, p) => { + const hk = this._hourKey(Date.now()); + return s + (p.hourlyUsage[hk] || 0); + }, 0); + + return { + timestamp: new Date().toISOString(), + totalProxies: this.proxies.length, + alive: this.proxies.filter(p => p.alive).length, + dead: this.proxies.filter(p => !p.alive).length, + circuitOpen: this.proxies.filter(p => p.circuitState === "open").length, + requestsTotal: totalReq, + successTotal: totalOk, + failureTotal: totalFail, + successRate: totalReq > 0 ? (totalOk / totalReq * 100).toFixed(1) + "%" : "0%", + hourlyUsageCurrent: allHourly, + byProvider + }; + } + + getProxyDetail() { + return this.proxies.map(p => ({ + url: p.masked, + provider: p.provider, + alive: p.alive, + authenticated: p.authenticated, + circuitState: p.circuitState, + circuitFailures: p.circuitFailures, + failures: p.failures, + consecutiveFailures: p.consecutiveFailures, + requests: p.totalRequests, + successes: p.totalSuccesses, + failures: p.totalFailures, + rateLimited: p.totalRateLimited, + timeouts: p.totalTimeouts, + avgLatencyMs: p.totalSuccesses > 0 ? Math.round(p.totalLatencyMs / p.totalSuccesses) : 0, + lastLatencyMs: p.lastLatencyMs, + hourlyUsage: p.hourlyUsage[this._hourKey(Date.now())] || 0 + })); + } + + _hourKey(ts) { + return Math.floor(ts / HOUR_MS); + } + + logState() { + this.log.info({ metrics: this.getMetrics() }, "proxy pool state"); + } + + destroy() { + if (this._metricsInterval) clearInterval(this._metricsInterval); + } +} diff --git a/src/index.js b/src/index.js new file mode 100755 index 0000000..970ed4b --- /dev/null +++ b/src/index.js @@ -0,0 +1,160 @@ +#!/usr/bin/env node + +import { loadConfig } from "./config.js"; +import { HttpClient } from "./http/client.js"; +import { ProxyPool } from "./http/proxy.js"; +import { SearchRunner } from "./run.js"; +import { parseCliArgs } from "./cli.js"; +import { setUserAgents } from "./utils/ua.js"; +import { getEngineNames } from "./engines/setup.js"; +import { logger, childLogger } from "./utils/logger.js"; +import { renderCombinedResults } from "./output/human.js"; +import { formatSearchResponse, formatErrorResponse, formatHelpResponse } from "./output/agent.js"; +import { generateKey, saveKeyToEnv } from "./api-key.js"; +import { startServer } from "./api.js"; + +async function main() { + const log = childLogger({ component: "main" }); + const args = parseCliArgs(); + + if (args.help) { + const engines = getEngineNames(); + console.log(formatHelpResponse(engines, args)); + console.log("\n\x1b[1mAPI Server:\x1b[0m"); + console.log(" --serve Start API server"); + console.log(" --port Port for API server (default: 9876)"); + console.log(" --generate-key Generate a new API key\n"); + process.exit(0); + } + + if (args.generateKey) { + const key = generateKey(); + const result = saveKeyToEnv(key); + console.log(`\n \x1b[32m✓\x1b[0m New API key generated`); + console.log(` Key: ${key}`); + console.log(` Stored: ${result.path}`); + console.log(` \n Use with: curl -H "Authorization: Bearer ${key}" ...\n`); + process.exit(0); + } + + if (args.serve) { + await startServer(args.port); + return; + } + + if (!args.query) { + if (args.metrics) { + log.warn("--metrics requires a search to run first; run a query and send SIGUSR1 for live metrics"); + } + if (args.mode === "agent") { + console.error(formatErrorResponse( + { code: "MISSING_QUERY", message: "Search query parameter is required" }, + { type: args.type } + )); + } else { + console.error("\n\x1b[31mError: Missing search query\x1b[0m\n"); + console.log("Usage: node src/index.js -q \"search term\" [options]"); + console.log(" -q, --query Search query (required)"); + console.log(" -t, --type Search type: web, image, or both (default: image)"); + console.log(" -l, --limit Max results per type (default: 10)"); + console.log(" -m, --mode Output mode: human or agent (default: human)"); + console.log(" -p, --proxy Proxy URL (e.g., http://user:pass@host:port)"); + console.log(" -h, --help Show help\n"); + } + process.exit(1); + } + + log.info({ query: args.query, type: args.type, limit: args.limit, mode: args.mode }, "starting search"); + + const config = await loadConfig(args.configPath); + log.info({ proxyCount: config.proxies.length, proxyEnabled: config.proxy.enabled }, "config loaded"); + + if (config.http.user_agents) { + setUserAgents(config.http.user_agents); + log.debug({ count: config.http.user_agents.length }, "custom user agents loaded"); + } + + const httpClient = new HttpClient(config.http); + const proxyPool = new ProxyPool(config.proxies, config.proxy); + + if (config.proxy.enabled) { + httpClient.setProxyPool(proxyPool); + proxyPool.logState(); + } + + if (args.proxy) { + log.info({ proxy: args.proxy.replace(/\/\/([^:]+):([^@]+)@/, "//$1:***@") }, "using CLI proxy override"); + const singlePool = new ProxyPool([args.proxy], { enabled: true }); + httpClient.setProxyPool(singlePool); + } + + process.on("SIGUSR1", () => { + console.log("\n--- Proxy Pool Metrics (SIGUSR1) ---"); + console.log(JSON.stringify(proxyPool.getMetrics(), null, 2)); + console.log("--- Proxy Detail ---"); + console.log(JSON.stringify(proxyPool.getProxyDetail(), null, 2)); + }); + + const runner = new SearchRunner({ httpClient, config }); + + try { + const data = await runner.run({ + query: args.query, + type: args.type, + limit: args.limit + }); + + if (args.mode === "agent") { + console.log(formatSearchResponse(data)); + } else { + if (data.image || data.web) { + renderCombinedResults({ + query: data.query, + image: data.image, + web: data.web + }); + } + + if (data.errors && data.errors.length > 0) { + const failed = data.errors.filter(e => e.code === "ENGINE_FAILED"); + if (failed.length > 0) { + console.error(`\n \x1b[33mWarning: ${failed.length} engine(s) failed, used fallback\x1b[0m`); + } + } + + if (args.metrics) { + console.log("\n\x1b[1m\x1b[36m── Proxy Pool Metrics ──\x1b[0m"); + console.log(JSON.stringify(proxyPool.getMetrics(), null, 2)); + console.log("\n\x1b[1m\x1b[36m── Per-Proxy Detail ──\x1b[0m"); + console.log(JSON.stringify(proxyPool.getProxyDetail(), null, 2)); + } + } + } catch (err) { + log.error({ error: err.message }, "search failed with unhandled error"); + if (args.mode === "agent") { + console.error(formatErrorResponse(err, { query: args.query, type: args.type })); + } else { + console.error(`\n\x1b[31mError: ${err.message}\x1b[0m\n`); + } + process.exit(1); + } +} + +export async function search(opts) { + const config = await loadConfig(opts?.configPath); + const httpClient = new HttpClient(config.http); + + if (config.proxy.enabled) { + const proxyPool = new ProxyPool(config.proxies, config.proxy); + httpClient.setProxyPool(proxyPool); + } + + const runner = new SearchRunner({ httpClient, config }); + return runner.run({ + query: opts.query, + type: opts.type || "image", + limit: opts.limit || 10 + }); +} + +main(); diff --git a/src/output/agent.js b/src/output/agent.js new file mode 100644 index 0000000..72e5f17 --- /dev/null +++ b/src/output/agent.js @@ -0,0 +1,133 @@ +export function formatSearchResponse(data) { + const response = { + success: true, + query: data.query, + type: data.type, + timestamp: data.timestamp || new Date().toISOString(), + execution_time_ms: data.executionTimeMs || 0, + engines_used: data.enginesUsed || [] + }; + + if (data.image) { + response.image = formatResultsSection(data.image, "image"); + } + + if (data.web) { + response.web = formatResultsSection(data.web, "web"); + } + + if (data.errors && data.errors.length > 0) { + response.errors = data.errors; + } + + return JSON.stringify(response, null, 2); +} + +export function formatErrorResponse(error, options = {}) { + const response = { + success: false, + error: { + code: error.code || "SEARCH_FAILED", + message: error.message || "An unknown error occurred", + type: error.constructor?.name || "Error" + }, + query: options.query || null, + type: options.type || null, + timestamp: new Date().toISOString() + }; + + return JSON.stringify(response, null, 2); +} + +export function formatHelpResponse(engines, options) { + const response = { + success: true, + help: { + name: "polysearch", + description: "Multi-engine web + image search with proxy rotation, circuit breaker, and AI agent output", + version: "2.0.0", + usage: "node src/index.js [OPTIONS]", + options: { + query: { flag: "-q, --query ", required: true, description: "Search query" }, + type: { flag: "-t, --type ", required: false, default: "image", description: "Search type: web, image, or both" }, + limit: { flag: "-l, --limit ", required: false, default: 10, description: "Max results per type" }, + mode: { flag: "-m, --mode ", required: false, default: "human", description: "Output mode: human or agent" }, + config: { flag: "-c, --config ", required: false, description: "Path to config file" }, + proxy: { flag: "-p, --proxy ", required: false, description: "Single proxy URL to use" }, + help: { flag: "-h, --help", required: false, default: false, description: "Show this help" } + }, + examples: [ + "node src/index.js -q \"vintage radio\"", + "node src/index.js -q \"spacex launch\" -t both -l 15 --mode agent", + "node src/index.js -q \"cats\" -t web -l 5 -p http://user:pass@proxy:8080", + "node src/index.js -q \"1950 computer\" -m agent" + ], + engines: engines.map(e => ({ + name: e.name, + supports: e.supports, + priority: e.priority + })), + output_modes: { + human: "Colorized terminal output for humans", + agent: "Structured JSON for AI agent consumption" + } + } + }; + + return JSON.stringify(response, null, 2); +} + +function formatResultsSection(section, type) { + const out = { + engine: section.engine || "unknown", + total: section.total || 0 + }; + + if (type === "image") { + out.results = (section.results || []).map((item, i) => ({ + index: i + 1, + title: item.title || null, + image_url: item.image_url || null, + source_url: item.source_url || null, + domain: item.domain || "unknown", + width: item.width || null, + height: item.height || null, + thumbnail: item.thumbnail || null, + engine: item.engine || section.engine || null + })); + out.statistics = { + avg_width: avg(section.results, "width"), + avg_height: avg(section.results, "height"), + domains: countBy(section.results, "domain") + }; + } else { + out.results = (section.results || []).map((item, i) => ({ + index: i + 1, + title: item.title || null, + url: item.url || null, + domain: item.domain || "unknown", + snippet: item.snippet || null, + engine: item.engine || section.engine || null + })); + out.statistics = { + domains: countBy(section.results, "domain") + }; + } + + return out; +} + +function countBy(arr, key) { + const counts = {}; + for (const item of arr || []) { + const val = item[key] || "unknown"; + counts[val] = (counts[val] || 0) + 1; + } + return counts; +} + +function avg(arr, key) { + const nums = (arr || []).map(i => i[key]).filter(n => n != null); + if (nums.length === 0) return null; + return Math.round(nums.reduce((s, n) => s + n, 0) / nums.length); +} diff --git a/src/output/human.js b/src/output/human.js new file mode 100644 index 0000000..a8895c1 --- /dev/null +++ b/src/output/human.js @@ -0,0 +1,72 @@ +export function renderImageResults(results, query) { + if (!results || results.length === 0) { + console.log(`\n No image results found for: "${query}"\n`); + return; + } + + console.log(`\n\x1b[1m\x1b[36mImage Results for:\x1b[0m "${query}"`); + console.log(`\x1b[90m${"─".repeat(80)}\x1b[0m\n`); + + for (const img of results) { + const reset = "\x1b[0m"; + const dims = (img.width && img.height) ? `${img.width}\u00d7${img.height}` : "Unknown"; + + console.log(`\x1b[1m[${img.index}]\x1b[0m`); + console.log(` \x1b[1mDescription:\x1b[0m ${img.title || "No description"}`); + console.log(` \x1b[1mDomain:\x1b[0m ${colorDomain(img.domain)}${img.domain}${reset}`); + console.log(` \x1b[1mDimensions:\x1b[0m ${dims}`); + console.log(` \x1b[1mImage URL:\x1b[0m \x1b[4m${img.image_url}${reset}`); + if (img.source_url) { + console.log(` \x1b[1mSource Page:\x1b[0m \x1b[4m${img.source_url}${reset}`); + } + console.log(`\x1b[90m${"─".repeat(80)}\x1b[0m`); + } + + console.log(` Found ${results.length} image result${results.length !== 1 ? "s" : ""}\n`); +} + +export function renderWebResults(results, query) { + if (!results || results.length === 0) { + console.log(`\n No web results found for: "${query}"\n`); + return; + } + + console.log(`\n\x1b[1m\x1b[36mWeb Results for:\x1b[0m "${query}"`); + console.log(`\x1b[90m${"─".repeat(80)}\x1b[0m\n`); + + for (const r of results) { + console.log(`\x1b[1m[${r.index}] ${r.title}\x1b[0m`); + console.log(` \x1b[1mURL:\x1b[0m \x1b[4m${r.url}${"\x1b[0m"}`); + console.log(` \x1b[1mDomain:\x1b[0m \x1b[32m${r.domain}\x1b[0m`); + if (r.snippet) { + console.log(` \x1b[1mSnippet:\x1b[0m ${r.snippet.slice(0, 200)}`); + } + console.log(`\x1b[90m${"─".repeat(80)}\x1b[0m`); + } + + console.log(` Found ${results.length} web result${results.length !== 1 ? "s" : ""}\n`); +} + +export function renderCombinedResults(data) { + if (data.image && data.image.results) { + renderImageResults( + data.image.results.map((r, i) => ({ ...r, index: i + 1 })), + data.query + ); + } + if (data.web && data.web.results) { + renderWebResults( + data.web.results.map((r, i) => ({ ...r, index: i + 1 })), + data.query + ); + } +} + +function colorDomain(domain) { + const lower = domain.toLowerCase(); + if (lower.includes("shutterstock") || lower.includes("alamy") || + lower.includes("istock") || lower.includes("getty")) return "\x1b[31m"; + if (lower.endsWith(".edu") || lower.endsWith(".gov") || lower.includes("wikimedia")) return "\x1b[35m"; + if (lower.includes("unsplash") || lower.includes("pexels") || lower.includes("pixabay")) return "\x1b[33m"; + return "\x1b[32m"; +} diff --git a/src/run.js b/src/run.js new file mode 100644 index 0000000..5ede74b --- /dev/null +++ b/src/run.js @@ -0,0 +1,123 @@ +import { createEngines } from "./engines/setup.js"; +import { childLogger } from "./utils/logger.js"; + +export class SearchRunner { + constructor({ httpClient, config = {} }) { + this.log = childLogger({ component: "runner" }); + this.http = httpClient; + this.config = config; + this.engines = createEngines(httpClient); + } + + async run({ query, type = "image", limit = 10, proxy = null, signal = null }) { + const startTime = Date.now(); + const types = type === "both" ? ["web", "image"] : [type]; + const result = { + query, + type, + timestamp: new Date().toISOString(), + executionTimeMs: 0, + enginesUsed: [], + errors: [] + }; + + this.log.info({ query, type, limit }, "search started"); + + for (const t of types) { + const engines = t === "web" ? this.engines.web() : this.engines.image(); + + this.log.debug({ type: t, engineCount: engines.length }, "searching type"); + + if (engines.length === 0) { + this.log.error({ type: t }, "no registered engines for type"); + result.errors.push({ + type: t, + code: "NO_ENGINES", + message: `No registered engines support "${t}" search` + }); + continue; + } + + const { section, usedEngines, errors } = await this._tryEngines( + engines, query, t, limit, proxy, signal + ); + + result[t] = section; + result.enginesUsed.push(...usedEngines); + result.errors.push(...errors); + } + + result.executionTimeMs = Date.now() - startTime; + + this.log.info({ + executionTimeMs: result.executionTimeMs, + enginesUsed: result.enginesUsed, + errors: result.errors.length, + imageCount: result.image?.results?.length || 0, + webCount: result.web?.results?.length || 0 + }, "search completed"); + + return result; + } + + async _tryEngines(engines, query, type, limit, proxy, signal) { + let section = null; + const usedEngines = []; + const errors = []; + + for (const engine of engines) { + if (section) break; + + this.log.debug({ engine: engine.engineName, type }, "trying engine"); + + try { + const engineResult = await engine.search(query, { + type, + limit, + proxy, + signal + }); + + if (engineResult && engineResult.results && engineResult.results.length > 0) { + this.log.info({ + engine: engine.engineName, + type, + resultCount: engineResult.results.length + }, "engine returned results"); + + section = { + engine: engineResult.engine, + total: engineResult.results.length, + results: engineResult.results.map((r, i) => ({ + ...r, + index: i + 1 + })) + }; + usedEngines.push(engine.engineName); + } else { + this.log.debug({ engine: engine.engineName, type }, "engine returned 0 results"); + } + } catch (err) { + this.log.warn({ + engine: engine.engineName, + type, + error: err.message + }, "engine failed"); + errors.push({ + type, + engine: engine.engineName, + code: "ENGINE_FAILED", + message: err.message + }); + usedEngines.push(`${engine.engineName}(failed)`); + } + } + + if (!section) { + this.log.warn({ type }, "all engines failed for type"); + section = { engine: "none", total: 0, results: [] }; + } + + return { section, usedEngines, errors }; + } +} diff --git a/src/utils/logger.js b/src/utils/logger.js new file mode 100644 index 0000000..533a432 --- /dev/null +++ b/src/utils/logger.js @@ -0,0 +1,23 @@ +import pino from "pino"; + +const level = process.env.LOG_LEVEL || "info"; + +export const logger = pino({ + level, + transport: process.stdout.isTTY + ? { + target: "pino/file", + options: { colorize: true } + } + : undefined, + formatters: { + level(label) { + return { level: label }; + } + }, + timestamp: pino.stdTimeFunctions.isoTime +}); + +export function childLogger(bindings) { + return logger.child(bindings); +} diff --git a/src/utils/retry.js b/src/utils/retry.js new file mode 100644 index 0000000..7c5447d --- /dev/null +++ b/src/utils/retry.js @@ -0,0 +1,45 @@ +export async function withRetry(fn, { + attempts = 3, + baseDelayMs = 1000, + maxDelayMs = 10000, + onRetry = null +} = {}) { + let lastError; + + for (let i = 0; i < attempts; i++) { + try { + return await fn(i); + } catch (err) { + lastError = err; + if (i === attempts - 1) break; + + const delay = Math.min( + baseDelayMs * Math.pow(2, i) + Math.random() * baseDelayMs, + maxDelayMs + ); + + if (onRetry) onRetry({ attempt: i + 1, error: err, delayMs: Math.round(delay) }); + + await new Promise(r => setTimeout(r, delay)); + } + } + + throw lastError; +} + +export function isRetryable(err) { + if (!err) return false; + const msg = String(err.message || err).toLowerCase(); + const status = err.status || err.statusCode || 0; + + if (status >= 500 && status < 600) return true; + if (status === 429) return true; + if (status === 0) return true; + if (msg.includes('timeout') || msg.includes('timed out')) return true; + if (msg.includes('econnrefused') || msg.includes('econnreset')) return true; + if (msg.includes('etimedout') || msg.includes('enotfound') || msg.includes('eai_again')) return true; + if (msg.includes('network') || msg.includes('socket') || msg.includes('fetch failed')) return true; + if (msg.includes('could not extract token')) return false; + + return false; +} diff --git a/src/utils/ua.js b/src/utils/ua.js new file mode 100644 index 0000000..092d05d --- /dev/null +++ b/src/utils/ua.js @@ -0,0 +1,29 @@ +const DEFAULT_AGENTS = [ + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.5; rv:127.0) Gecko/20100101 Firefox/127.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0", + "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1" +]; + +let agents = [...DEFAULT_AGENTS]; +let index = 0; + +export function setUserAgents(list) { + if (Array.isArray(list) && list.length > 0) { + agents = list; + } +} + +export function getRandomUA() { + return agents[Math.floor(Math.random() * agents.length)]; +} + +export function getNextUA() { + const ua = agents[index % agents.length]; + index++; + return ua; +}