feat: crawl-manufacturer agent script — Haiku tool-use loop + bulk upsert
This commit is contained in:
308
scripts/crawl-manufacturer.ts
Normal file
308
scripts/crawl-manufacturer.ts
Normal file
@@ -0,0 +1,308 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* Crawl a manufacturer's website and upsert their products into the GearBox catalog.
|
||||
*
|
||||
* Usage:
|
||||
* bun run scripts/crawl-manufacturer.ts --manufacturer=apidura
|
||||
* bun run scripts/crawl-manufacturer.ts --manufacturer=canyon --dry-run
|
||||
*
|
||||
* Env vars (required unless a default is noted):
|
||||
* ANTHROPIC_API_KEY — Anthropic API key
|
||||
* GEARBOX_URL — Base URL of the GearBox instance (optional; default: http://localhost:3000)
|
||||
* GEARBOX_API_KEY — GearBox API key with write access
|
||||
*/
|
||||
|
||||
import Anthropic from "@anthropic-ai/sdk";
|
||||
import { CATEGORIES } from "./taxonomy/categories.ts";
|
||||
import { TAGS } from "./taxonomy/tags.ts";
|
||||
|
||||
/** Base URL used when GEARBOX_URL is unset or blank. */
const DEFAULT_GEARBOX_URL = "http://localhost:3000";

/**
 * Normalise the configured GearBox base URL so absolute paths such as
 * `/api/...` can be appended to it directly.
 *
 * @param raw - Value of the GEARBOX_URL environment variable, if any.
 * @returns The trimmed URL without trailing slashes, or the default when the
 *          value is missing or blank.
 */
function resolveBaseUrl(raw: string | undefined): string {
  const trimmed = raw?.trim() ?? "";
  // An empty string is not nullish, so `??` alone would let it through.
  if (trimmed === "") return DEFAULT_GEARBOX_URL;
  // Drop trailing slashes so `${GEARBOX_URL}/api/...` never contains `//api`.
  return trimmed.replace(/\/+$/, "");
}

const GEARBOX_URL = resolveBaseUrl(process.env.GEARBOX_URL);
const GEARBOX_API_KEY = process.env.GEARBOX_API_KEY ?? "";
const ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY ?? "";
const MODEL = "claude-haiku-4-5-20251001";
const MAX_TOOL_ROUNDS = 30; // safety limit on model round-trips per crawl
|
||||
|
||||
// ── Parse CLI args ────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse `--key=value` and bare `--flag` arguments into a lookup table.
 *
 * Arguments that do not start with `--` are ignored. A bare flag maps to the
 * string "true". Only the first `=` separates key from value, so values may
 * themselves contain `=` (e.g. `--url=https://x.test/?a=b`).
 *
 * @param argv - Command-line arguments without the runtime and script path.
 * @returns Map from option name (without the leading dashes) to its value.
 */
function parseCliArgs(argv: readonly string[]): Record<string, string> {
  const entries: Array<[string, string]> = [];
  for (const arg of argv) {
    if (!arg.startsWith("--")) continue;
    const option = arg.slice(2);
    const eq = option.indexOf("=");
    if (eq === -1) {
      entries.push([option, "true"]);
    } else {
      entries.push([option.slice(0, eq), option.slice(eq + 1)]);
    }
  }
  // Object.fromEntries defines own properties, so odd keys such as
  // "__proto__" cannot touch the prototype chain.
  return Object.fromEntries(entries);
}

const args = parseCliArgs(process.argv.slice(2));
|
||||
|
||||
const manufacturerSlug = args["manufacturer"];
const dryRun = args["dry-run"] === "true";

/** Print a fatal startup problem to stderr and terminate with exit code 1. */
function abortStartup(message: string): never {
  console.error(message);
  process.exit(1);
}

// Checks run in a fixed order: CLI usage first, then each required credential.
if (!manufacturerSlug) abortStartup("Usage: bun run scripts/crawl-manufacturer.ts --manufacturer=<slug>");
if (!GEARBOX_API_KEY) abortStartup("GEARBOX_API_KEY env var is required");
if (!ANTHROPIC_API_KEY) abortStartup("ANTHROPIC_API_KEY env var is required");
|
||||
|
||||
// ── Fetch manufacturer from GearBox ──────────────────────────────
|
||||
|
||||
/**
 * Load a manufacturer record from the GearBox API by slug.
 *
 * @param slug - Manufacturer slug as given on the command line.
 * @returns The parsed JSON body. The shape is asserted, not validated at runtime.
 * @throws Error when the API answers with a non-2xx status.
 */
async function fetchManufacturer(slug: string) {
  // The slug comes straight from the CLI; encode it so characters such as
  // "/", "?" or "#" cannot change which endpoint is requested.
  const res = await fetch(`${GEARBOX_URL}/api/manufacturers/${encodeURIComponent(slug)}`);
  if (!res.ok) {
    throw new Error(`Manufacturer not found: ${slug} (HTTP ${res.status})`);
  }
  return res.json() as Promise<{
    id: number;
    name: string;
    slug: string;
    website: string;
    tier: number;
    country: string | null;
  }>;
}
|
||||
|
||||
// ── Tool: fetch a web page ────────────────────────────────────────
|
||||
|
||||
/** Abort a page download that takes longer than this many milliseconds. */
const PAGE_FETCH_TIMEOUT_MS = 15_000;

/** Upper bound on characters handed back to the model per page. */
const PAGE_MAX_CHARS = 60_000;

/** Strip scripts, styles, comments and long whitespace runs, then truncate, for token efficiency. */
function condenseHtml(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<!--[\s\S]*?-->/g, "")
    .replace(/\s{3,}/g, " ")
    .slice(0, PAGE_MAX_CHARS);
}

/**
 * Tool implementation: download a web page and return condensed HTML.
 *
 * This function never throws. Every failure (unsupported or malformed URL,
 * network error, timeout, non-2xx status) is reported as a short text message
 * so it can be passed back to the model as the tool result.
 *
 * @param url - Absolute URL requested by the model.
 * @returns Condensed page HTML, or a one-line description of the failure.
 */
async function fetchPage(url: string): Promise<string> {
  try {
    const target = new URL(url);
    // The URL is chosen by the model, which is influenced by untrusted page
    // content. Refuse anything but plain web URLs so schemes such as file:
    // cannot be used to read local resources.
    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return `Error fetching ${url}: unsupported protocol "${target.protocol}" (only http and https are allowed)`;
    }

    const res = await fetch(target.href, {
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; GearBox-Catalog-Bot/1.0)",
        Accept: "text/html,application/xhtml+xml",
      },
      signal: AbortSignal.timeout(PAGE_FETCH_TIMEOUT_MS),
    });
    if (!res.ok) return `HTTP ${res.status} for ${url}`;

    return condenseHtml(await res.text());
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    return `Error fetching ${url}: ${reason}`;
  }
}
|
||||
|
||||
// ── Build system prompt ───────────────────────────────────────────
|
||||
|
||||
/**
 * Compose the system prompt that tells the model how to crawl one
 * manufacturer's site and what JSON shape to return.
 *
 * The allowed categories and tags are taken from the imported taxonomy lists
 * and embedded verbatim, comma-separated.
 *
 * @param manufacturer - Record returned by fetchManufacturer; only `name` and
 *                       `website` are used.
 * @returns The complete prompt text.
 */
function buildSystemPrompt(manufacturer: Awaited<ReturnType<typeof fetchManufacturer>>) {
  const { name, website } = manufacturer;
  const categoryList = CATEGORIES.join(", ");
  const tagList = TAGS.join(", ");

  return `You are a product data extraction agent for GearBox, a gear management app for bikepacking, cycling, and hiking.

Your task: crawl ${name}'s website (${website}) and extract their complete product catalog.

For each product, extract:
- model: string (product name WITHOUT the brand prefix)
- category: one of [${categoryList}]
- weightGrams: number | null (weight in grams — convert if shown in oz/lbs/kg)
- priceCents: number | null (MSRP in cents, base currency)
- priceCurrency: string (ISO currency code — "EUR" for DE brands, "USD" for US, "GBP" for GB, etc.)
- description: string | null (1-3 sentence product description)
- sourceUrl: string (direct product page URL)
- tags: string[] (from this list only: [${tagList}])

Rules:
- model must NOT include the brand name (e.g., "Terrapin System" not "Revelate Designs Terrapin System")
- Only include outdoor/adventure/cycling products. Skip accessories under €5, clothing if not relevant to the target categories.
- If weight is not listed on a product page, use null — do not guess.
- Assign 2-5 relevant tags per item.
- Extract every product in their catalog, not just featured ones. Navigate to all relevant subcategories.

When done, output a JSON array of product objects as your final message. Do not wrap in markdown — raw JSON only.

Example output:
[
{
"model": "Expedition Handlebar Pack",
"category": "bags",
"weightGrams": 300,
"priceCents": 16000,
"priceCurrency": "GBP",
"description": "14L waterproof handlebar roll bag with internal dry bag and accessory pocket.",
"sourceUrl": "https://apidura.com/shop/expedition-handlebar-pack/",
"tags": ["bikepacking", "handlebar-bag", "bike-bag"]
}
]`;
}
|
||||
|
||||
// ── Agentic tool-use loop ─────────────────────────────────────────
|
||||
|
||||
/** One product as the crawl agent is asked to report it in its final JSON array. */
type CatalogItem = {
  /** Product name without the brand prefix. */
  model: string;
  /** Category name; the prompt restricts it to the taxonomy list. */
  category: string;
  /** Direct product page URL. */
  sourceUrl: string;
  /** Short product description, or null when none was extracted. */
  description: string | null;
  /** Weight in grams, or null when the page does not list one. */
  weightGrams: number | null;
  /** Price in cents of `priceCurrency`, or null when unknown. */
  priceCents: number | null;
  /** ISO currency code for `priceCents`. */
  priceCurrency: string;
  /** Tag names; the prompt restricts them to the taxonomy list. */
  tags: string[];
};
|
||||
|
||||
/**
 * Drive the model through a tool-use loop until it returns the product list.
 *
 * Each round sends the conversation so far, then either parses the final
 * answer (stop reason "end_turn") or runs the requested `fetch_page` calls and
 * feeds their results back as the next user message.
 *
 * @param manufacturer - Record returned by fetchManufacturer.
 * @returns The products parsed from the model's final text message.
 * @throws Error when the model stops for a reason other than "end_turn" or
 *         "tool_use", finishes without a text block, or does not finish
 *         within MAX_TOOL_ROUNDS rounds. Errors from the SDK call and from
 *         parseAgentOutput propagate unchanged.
 */
async function runCrawlAgent(manufacturer: Awaited<ReturnType<typeof fetchManufacturer>>): Promise<CatalogItem[]> {
  const client = new Anthropic({ apiKey: ANTHROPIC_API_KEY });

  // The prompt depends only on the manufacturer, so build it once instead of
  // once per round.
  const system = buildSystemPrompt(manufacturer);

  const tools: Anthropic.Tool[] = [
    {
      name: "fetch_page",
      description: "Fetch the HTML content of a URL. Use this to explore the manufacturer's website and product pages.",
      input_schema: {
        type: "object" as const,
        properties: {
          url: { type: "string", description: "The URL to fetch" },
        },
        required: ["url"],
      },
    },
  ];

  const messages: Anthropic.MessageParam[] = [
    {
      role: "user",
      content: `Crawl ${manufacturer.name}'s website at ${manufacturer.website} and extract their complete product catalog. Start with the homepage or sitemap, navigate to all product categories, and return the full product list as JSON.`,
    },
  ];

  for (let round = 1; round <= MAX_TOOL_ROUNDS; round++) {
    console.log(` [round ${round}] calling model...`);

    const response = await client.messages.create({
      model: MODEL,
      max_tokens: 8192,
      system,
      tools,
      messages,
    });

    // Keep the assistant turn in the history so tool results can refer to it.
    messages.push({ role: "assistant", content: response.content });

    if (response.stop_reason === "end_turn") {
      // Final message — the JSON is expected in the first text block.
      const textBlock = response.content.find((b) => b.type === "text");
      if (!textBlock || textBlock.type !== "text") {
        throw new Error("Agent finished without text output");
      }
      return parseAgentOutput(textBlock.text);
    }

    if (response.stop_reason !== "tool_use") {
      throw new Error(`Unexpected stop reason: ${response.stop_reason}`);
    }

    // Every tool_use block must be answered with a tool_result in the next
    // user message, including calls that cannot be executed.
    const toolResults: Anthropic.ToolResultBlockParam[] = [];
    for (const block of response.content) {
      if (block.type !== "tool_use") continue;

      if (block.name !== "fetch_page") {
        toolResults.push({
          type: "tool_result",
          tool_use_id: block.id,
          content: `Unknown tool: ${block.name}`,
          is_error: true,
        });
        continue;
      }

      const input: unknown = block.input;
      const url = typeof input === "object" && input !== null ? (input as { url?: unknown }).url : undefined;
      if (typeof url !== "string" || url.trim() === "") {
        toolResults.push({
          type: "tool_result",
          tool_use_id: block.id,
          content: "fetch_page requires a non-empty string `url`",
          is_error: true,
        });
        continue;
      }

      console.log(` [tool] fetch_page ${url}`);
      toolResults.push({
        type: "tool_result",
        tool_use_id: block.id,
        content: await fetchPage(url),
      });
    }

    messages.push({ role: "user", content: toolResults });
  }

  throw new Error(`Agent exceeded ${MAX_TOOL_ROUNDS} tool rounds without finishing`);
}
|
||||
|
||||
/**
 * Turn the model's final text message into a list of catalog items.
 *
 * The model is asked for raw JSON but does not always comply, so this accepts:
 *  - a bare JSON array,
 *  - an array wrapped in a Markdown code fence (with or without a language
 *    tag, with surrounding whitespace),
 *  - an array embedded in surrounding prose (the outermost `[` … `]` span).
 *
 * @param text - Text content of the model's final message.
 * @returns The parsed items. Only `model` is checked per entry; other fields
 *          are passed through as received.
 * @throws SyntaxError when no parseable JSON is found.
 * @throws Error when the JSON is not an array, or when an entry is not an
 *         object with a string `model`.
 */
function parseAgentOutput(text: string): CatalogItem[] {
  // Trim first so a fence preceded by whitespace is still recognised.
  const unfenced = text
    .trim()
    .replace(/^```[\w-]*\s*/, "")
    .replace(/\s*```$/, "")
    .trim();

  let parsed: unknown;
  try {
    parsed = JSON.parse(unfenced);
  } catch (err: unknown) {
    // Fall back to the outermost bracketed span when prose surrounds the JSON.
    const start = unfenced.indexOf("[");
    const end = unfenced.lastIndexOf("]");
    if (start === -1 || end <= start) throw err;
    parsed = JSON.parse(unfenced.slice(start, end + 1));
  }

  if (!Array.isArray(parsed)) throw new Error("Agent output is not a JSON array");

  // Fail early on entries that cannot be products instead of sending them to the API.
  parsed.forEach((entry: unknown, index: number) => {
    const looksLikeItem =
      typeof entry === "object" &&
      entry !== null &&
      !Array.isArray(entry) &&
      typeof (entry as { model?: unknown }).model === "string";
    if (!looksLikeItem) {
      throw new Error(`Agent output entry at index ${index} is not a product object with a string "model"`);
    }
  });

  return parsed as CatalogItem[];
}
|
||||
|
||||
// ── Upsert to GearBox API ─────────────────────────────────────────
|
||||
|
||||
/**
 * Send the extracted products to the GearBox bulk endpoint in batches.
 *
 * Batches are posted one after another; the first failing batch aborts the
 * run, leaving earlier batches already applied.
 *
 * @param slug  - Manufacturer slug attached to every item.
 * @param items - Products returned by the crawl agent.
 * @returns Sum of the `created` and `updated` counts reported per batch.
 * @throws Error when a batch request returns a non-2xx status.
 */
async function upsertItems(
  slug: string,
  items: CatalogItem[],
): Promise<{ created: number; updated: number }> {
  // Chunk into batches of 100 (API limit)
  const batchSize = 100;

  // NOTE(review): `priceCurrency` is extracted by the agent but not forwarded
  // here — confirm whether the bulk endpoint accepts a currency field.
  const payload = items.map(({ model, category, weightGrams, priceCents, description, sourceUrl, tags }) => ({
    manufacturerSlug: slug,
    model,
    category,
    // null becomes undefined so the key is dropped by JSON.stringify
    weightGrams: weightGrams ?? undefined,
    priceCents: priceCents ?? undefined,
    description: description ?? undefined,
    sourceUrl,
    tags,
  }));

  const totals = { created: 0, updated: 0 };

  for (let offset = 0, batchNo = 1; offset < payload.length; offset += batchSize, batchNo++) {
    const response = await fetch(`${GEARBOX_URL}/api/global-items/bulk`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-API-Key": GEARBOX_API_KEY,
      },
      body: JSON.stringify({ items: payload.slice(offset, offset + batchSize) }),
    });

    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`Bulk upsert failed (HTTP ${response.status}): ${detail}`);
    }

    const outcome = await response.json() as { created: number; updated: number };
    totals.created += outcome.created;
    totals.updated += outcome.updated;
    console.log(` batch ${batchNo}: +${outcome.created} new, ~${outcome.updated} updated`);
  }

  return totals;
}
|
||||
|
||||
// ── Main ──────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: look up the manufacturer, run the crawl agent, then
 * either preview the first three items (dry run) or upsert everything.
 */
async function main() {
  console.log(`\nCrawling manufacturer: ${manufacturerSlug}`);
  if (dryRun) {
    console.log("DRY RUN — products will not be saved\n");
  }

  const manufacturer = await fetchManufacturer(manufacturerSlug);
  console.log(`Found: ${manufacturer.name} (${manufacturer.website})\n`);

  console.log("Starting agent crawl...");
  const extracted = await runCrawlAgent(manufacturer);
  console.log(`\nAgent extracted ${extracted.length} products`);

  if (!dryRun) {
    console.log("\nUpserting to catalog...");
    const summary = await upsertItems(manufacturerSlug, extracted);
    console.log(`\nDone: ${summary.created} created, ${summary.updated} updated`);
    return;
  }

  // Dry run: show a small sample instead of writing anything.
  const preview = extracted.slice(0, 3);
  console.log("\nDry run output (first 3 items):");
  console.log(JSON.stringify(preview, null, 2));
}

// Any failure is logged and turned into a non-zero exit code.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user