---
phase: 29
plan: 04
type: backend
wave: 2
depends_on: [01]
files_modified:
  - scripts/backfill-dominant-colors.ts
autonomous: true
requirements: []
---

Create a one-time backfill script that processes all existing images in the database to extract and store their dominant color. It handles `items` and `threadCandidates` that have an `imageFilename` (fetched from S3), plus `globalItems` that have an external `imageSourceUrl` or `imageUrl` (fetched over HTTP).

### Task 1: Create backfill script

Relevant files:

- src/db/schema.ts
- src/server/services/storage.service.ts
- src/server/services/image.service.ts

Create `scripts/backfill-dominant-colors.ts`:

```ts
/**
 * Backfill dominant colors for all existing images.
 * Run with: bun run scripts/backfill-dominant-colors.ts
 *
 * Idempotent — skips records that already have dominantColor set.
 * Processes in batches of 10 concurrent requests.
 */
import { GetObjectCommand, S3Client } from "@aws-sdk/client-s3";
import { and, eq, isNotNull, isNull } from "drizzle-orm";
import { drizzle } from "drizzle-orm/postgres-js";
import postgres from "postgres";
import sharp from "sharp";
import * as schema from "../src/db/schema";

/**
 * Reads a required environment variable.
 *
 * @throws Error naming the missing variable, so a misconfigured run fails
 *   immediately instead of reporting every image as "failed".
 */
function requireEnv(name: string): string {
  const value = process.env[name];
  if (!value) throw new Error(`${name} required`);
  return value;
}

/** Turns an unknown thrown value into a printable message. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

const DATABASE_URL = requireEnv("DATABASE_URL");

const client = postgres(DATABASE_URL);
const db = drizzle(client, { schema });

const s3 = new S3Client({
  endpoint: process.env.S3_ENDPOINT,
  region: process.env.S3_REGION ?? "us-east-1",
  credentials: {
    accessKeyId: requireEnv("S3_ACCESS_KEY"),
    secretAccessKey: requireEnv("S3_SECRET_KEY"),
  },
  forcePathStyle: true,
});
const bucket = process.env.S3_BUCKET ?? "gearbox-images";

/**
 * Reduces an image to a single pixel and returns it as a `#rrggbb` string.
 *
 * @param buffer Encoded image bytes.
 * @returns The hex color, or `null` if the image cannot be decoded.
 */
async function extractColor(buffer: Buffer): Promise<string | null> {
  try {
    const { data, info } = await sharp(buffer)
      .resize(1, 1)
      .raw()
      .toBuffer({ resolveWithObject: true });
    if (data.length === 0) return null;

    // The raw pixel may carry fewer than three channels (e.g. greyscale
    // input); in that case reuse the first channel for R, G and B rather than
    // reading past the end of the buffer.
    const hasRgb = info.channels >= 3;
    const channels = hasRgb ? [data[0], data[1], data[2]] : [data[0], data[0], data[0]];
    const hex = channels.map((value) => (value ?? 0).toString(16).padStart(2, "0")).join("");
    return `#${hex}`;
  } catch {
    return null;
  }
}

/**
 * Downloads an object from the configured S3 bucket.
 *
 * @param filename Object key inside the bucket.
 * @returns The object bytes, or `null` if the object is missing or the request fails.
 */
async function fetchFromS3(filename: string): Promise<Buffer | null> {
  try {
    const response = await s3.send(new GetObjectCommand({ Bucket: bucket, Key: filename }));
    const bytes = await response.Body?.transformToByteArray();
    return bytes ? Buffer.from(bytes) : null;
  } catch (err) {
    console.warn(` S3 fetch failed for ${filename}: ${describeError(err)}`);
    return null;
  }
}

/**
 * Downloads an image over HTTP with a 10 second timeout.
 *
 * @param url Absolute image URL.
 * @returns The response bytes, or `null` on a non-2xx status, timeout or network error.
 */
async function fetchFromUrl(url: string): Promise<Buffer | null> {
  try {
    const response = await fetch(url, { signal: AbortSignal.timeout(10000) });
    if (!response.ok) {
      console.warn(` HTTP ${response.status} for ${url}`);
      return null;
    }
    return Buffer.from(await response.arrayBuffer());
  } catch (err) {
    console.warn(` Fetch failed for ${url}: ${describeError(err)}`);
    return null;
  }
}

/**
 * Extracts and stores the dominant color for every record in `items`,
 * running at most 10 records concurrently.
 *
 * A record counts as failed when its image cannot be fetched, cannot be
 * decoded, or when `updateFn` rejects; failures are logged and skipped so one
 * bad record never aborts the run.
 *
 * @param items Records to process; each needs a numeric `id`.
 * @param getBuffer Loads the image bytes for a record (`null` = unavailable).
 * @param updateFn Persists the extracted color for the given record id.
 * @param label Prefix used in progress output.
 */
async function processBatch<T extends { id: number }>(
  items: T[],
  getBuffer: (item: T) => Promise<Buffer | null>,
  updateFn: (id: number, color: string) => Promise<void>,
  label: string,
): Promise<void> {
  const BATCH_SIZE = 10;
  let processed = 0;
  let updated = 0;
  let failed = 0;

  for (let i = 0; i < items.length; i += BATCH_SIZE) {
    const batch = items.slice(i, i + BATCH_SIZE);
    const results = await Promise.allSettled(
      batch.map(async (item): Promise<"updated" | "fetch failed" | "decode failed"> => {
        const buffer = await getBuffer(item);
        if (!buffer) return "fetch failed";
        const color = await extractColor(buffer);
        if (!color) return "decode failed";
        await updateFn(item.id, color);
        return "updated";
      }),
    );

    // Tally from the settled results so that rejected updates (e.g. a database
    // error) are counted as failures instead of silently disappearing.
    for (const [offset, result] of results.entries()) {
      if (result.status === "fulfilled" && result.value === "updated") {
        updated++;
        continue;
      }
      failed++;
      const reason =
        result.status === "rejected" ? describeError(result.reason) : result.value;
      console.warn(` ${label}: id ${batch[offset]?.id} skipped (${reason})`);
    }

    processed += batch.length;
    console.log(` ${label}: ${processed}/${items.length} processed, ${updated} updated, ${failed} failed`);
  }
}

/**
 * Runs the backfill for each table in turn. Every query selects only rows
 * whose dominantColor is still NULL, which is what makes re-runs safe.
 */
async function main(): Promise<void> {
  console.log("=== Backfill Dominant Colors ===\n");

  // Items with imageFilename but no dominantColor
  const itemsToProcess = await db
    .select({ id: schema.items.id, imageFilename: schema.items.imageFilename })
    .from(schema.items)
    .where(and(isNotNull(schema.items.imageFilename), isNull(schema.items.dominantColor)));
  console.log(`Items: ${itemsToProcess.length} need processing`);
  await processBatch(
    // Cast is safe: the query above excludes NULL imageFilename.
    itemsToProcess as { id: number; imageFilename: string }[],
    (item) => fetchFromS3(item.imageFilename),
    async (id, color) => {
      await db.update(schema.items).set({ dominantColor: color }).where(eq(schema.items.id, id));
    },
    "Items",
  );

  // GlobalItems with imageSourceUrl (fetched over HTTP)
  const globalWithFile = await db
    .select({ id: schema.globalItems.id, imageSourceUrl: schema.globalItems.imageSourceUrl })
    .from(schema.globalItems)
    .where(and(isNotNull(schema.globalItems.imageSourceUrl), isNull(schema.globalItems.dominantColor)));
  console.log(`\nGlobal Items (with source URL): ${globalWithFile.length} need processing`);
  await processBatch(
    globalWithFile as { id: number; imageSourceUrl: string }[],
    (item) => fetchFromUrl(item.imageSourceUrl),
    async (id, color) => {
      await db.update(schema.globalItems).set({ dominantColor: color }).where(eq(schema.globalItems.id, id));
    },
    "Global Items",
  );

  // GlobalItems with imageUrl (direct URLs). Runs after the pass above, so it
  // only sees rows that are still missing a color — including rows whose
  // imageSourceUrl could not be fetched.
  const globalWithUrl = await db
    .select({ id: schema.globalItems.id, imageUrl: schema.globalItems.imageUrl })
    .from(schema.globalItems)
    .where(and(isNotNull(schema.globalItems.imageUrl), isNull(schema.globalItems.dominantColor)));
  console.log(`\nGlobal Items (with image URL): ${globalWithUrl.length} need processing`);
  await processBatch(
    globalWithUrl as { id: number; imageUrl: string }[],
    (item) => fetchFromUrl(item.imageUrl),
    async (id, color) => {
      await db.update(schema.globalItems).set({ dominantColor: color }).where(eq(schema.globalItems.id, id));
    },
    "Global Items (URL)",
  );

  // Thread candidates
  const candidatesToProcess = await db
    .select({ id: schema.threadCandidates.id, imageFilename: schema.threadCandidates.imageFilename })
    .from(schema.threadCandidates)
    .where(and(isNotNull(schema.threadCandidates.imageFilename), isNull(schema.threadCandidates.dominantColor)));
  console.log(`\nCandidates: ${candidatesToProcess.length} need processing`);
  await processBatch(
    candidatesToProcess as { id: number; imageFilename: string }[],
    (item) => fetchFromS3(item.imageFilename),
    async (id, color) => {
      await db.update(schema.threadCandidates).set({ dominantColor: color }).where(eq(schema.threadCandidates.id, id));
    },
    "Candidates",
  );

  console.log("\n=== Backfill Complete ===");
  process.exit(0);
}

main().catch((err) => {
  console.error("Backfill failed:", err);
  process.exit(1);
});
```

Note: The exact import patterns for drizzle-orm may need adjustment based on the project's existing database connection setup. Check `src/db/` for the actual connection pattern used and replicate it in the script.

Verify:

```bash
test -f scripts/backfill-dominant-colors.ts && grep "extractColor" scripts/backfill-dominant-colors.ts && grep "processBatch" scripts/backfill-dominant-colors.ts && echo "PASS" || echo "FAIL"
```

Acceptance criteria:

- `scripts/backfill-dominant-colors.ts` exists
- Script queries items, globalItems, threadCandidates with images but no dominantColor
- Processes in batches of 10 concurrent
- Extracts dominant color via Sharp resize(1,1)
- Updates database records with extracted color
- Skips records that already have dominantColor (idempotent)
- Logs progress: `Items: 45/123 processed, 42 updated, 3 failed`
- Handles errors gracefully (skips failed images, logs them)
- Exits with 0 on success, 1 on fatal error

### Task 2: Add npm script for backfill

Relevant files:

- package.json

Add to `scripts` section in `package.json`:

```json
"backfill:colors": "bun run scripts/backfill-dominant-colors.ts"
```

Verify:

```bash
grep "backfill:colors" package.json && echo "PASS" || echo "FAIL"
```

Acceptance criteria:

- package.json contains `"backfill:colors"` script
- Script points to `scripts/backfill-dominant-colors.ts`

## Verification

1. `bun run lint` passes (script follows project conventions)
2. Script is syntactically valid: `bun build --no-bundle scripts/backfill-dominant-colors.ts > /dev/null` (transpile only). Do not "test" it with `bun run scripts/backfill-dominant-colors.ts --help` — the script parses no arguments, so that command would run the real backfill.
3. Script handles missing S3 credentials gracefully (error message, not crash)

## Success criteria

- Backfill script exists and processes all 3 tables
- Script is idempotent (safe to re-run)
- Batch processing limits concurrency to 10
- Progress logging shows processing status
- npm script shortcut available

## Threat model

| Threat | Severity | Mitigation |
|--------|----------|------------|
| S3 credential exposure in script | Low | Uses env vars from process.env, no hardcoded credentials |
| SSRF via globalItems imageUrl / imageSourceUrl | Medium | Script only processes URLs already stored in the database (previously validated on ingestion); fetch has 10s timeout |
| Database overload from bulk updates | Low | Batch size of 10 limits concurrent DB writes |

## Deliverables

- [ ] Backfill script at scripts/backfill-dominant-colors.ts
- [ ] Processes items, globalItems, threadCandidates
- [ ] Idempotent (skips existing dominantColor)
- [ ] Batch processing with concurrency limit
- [ ] Progress logging
- [ ] npm script shortcut