272 lines
9.4 KiB
Markdown
272 lines
9.4 KiB
Markdown
---
|
|
phase: 29
|
|
plan: 04
|
|
type: backend
|
|
wave: 2
|
|
depends_on: [01]
|
|
files_modified:
|
|
- scripts/backfill-dominant-colors.ts
|
|
autonomous: true
|
|
requirements: []
|
|
---
|
|
|
|
<objective>
|
|
Create a one-time backfill script that processes all existing images in the database to extract and store their dominant color. Handles items and threadCandidates with imageFilename (read from S3), plus globalItems with an external imageSourceUrl or imageUrl (downloaded over HTTP).
|
|
</objective>
|
|
|
|
<tasks>
|
|
|
|
### Task 1: Create backfill script
|
|
<task type="code">
|
|
<read_first>
|
|
- src/db/schema.ts
|
|
- src/server/services/storage.service.ts
|
|
- src/server/services/image.service.ts
|
|
</read_first>
|
|
<action>
|
|
Create `scripts/backfill-dominant-colors.ts`:
|
|
|
|
```ts
|
|
/**
|
|
* Backfill dominant colors for all existing images.
|
|
* Run with: bun run scripts/backfill-dominant-colors.ts
|
|
*
|
|
* Idempotent — skips records that already have dominantColor set.
|
|
* Processes in batches of 10 concurrent requests.
|
|
*/
|
|
|
|
import { GetObjectCommand, S3Client } from "@aws-sdk/client-s3";
|
|
import { drizzle } from "drizzle-orm/postgres-js";
|
|
import { isNull } from "drizzle-orm";
|
|
import postgres from "postgres";
|
|
import sharp from "sharp";
|
|
import * as schema from "../src/db/schema";
|
|
|
|
// --- Environment validation -------------------------------------------------
// Fail fast with a readable message instead of letting every later request
// fail one by one because of missing configuration.

const DATABASE_URL = process.env.DATABASE_URL;
if (!DATABASE_URL) throw new Error("DATABASE_URL required");

const S3_ACCESS_KEY = process.env.S3_ACCESS_KEY;
const S3_SECRET_KEY = process.env.S3_SECRET_KEY;
if (!S3_ACCESS_KEY || !S3_SECRET_KEY) {
  // Without this check the client would be built with undefined credentials
  // and each S3 fetch would fail individually, reported only as "failed".
  console.error("S3_ACCESS_KEY and S3_SECRET_KEY are required to read images from S3");
  process.exit(1);
}

// --- Shared clients ---------------------------------------------------------

// postgres.js connection used by drizzle for all queries and updates.
const client = postgres(DATABASE_URL);
const db = drizzle(client, { schema });

// S3 client; endpoint and region come from the environment, path-style
// addressing is always enabled.
const s3 = new S3Client({
  endpoint: process.env.S3_ENDPOINT,
  region: process.env.S3_REGION ?? "us-east-1",
  credentials: {
    accessKeyId: S3_ACCESS_KEY,
    secretAccessKey: S3_SECRET_KEY,
  },
  forcePathStyle: true,
});

// Bucket that image filenames are resolved against.
const bucket = process.env.S3_BUCKET ?? "gearbox-images";
|
|
|
|
/**
 * Computes a representative colour for an encoded image.
 *
 * The image is scaled down to a single pixel with sharp and that pixel is
 * formatted as a lowercase `#rrggbb` string.
 *
 * @param buffer Encoded image bytes.
 * @returns The hex colour, or `null` when the image cannot be processed.
 */
async function extractColor(buffer: Buffer): Promise<string | null> {
  try {
    const { data, info } = await sharp(buffer)
      .resize(1, 1)
      .raw()
      .toBuffer({ resolveWithObject: true });

    // Raw output only has R, G, B in the first three bytes when there are at
    // least three channels. For greyscale output (one channel, or two with
    // alpha) reuse the grey level for all three components instead of reading
    // alpha / out-of-range bytes.
    const hasRgb = info.channels >= 3;
    const red = data[0];
    const green = hasRgb ? data[1] : red;
    const blue = hasRgb ? data[2] : red;
    if (red === undefined || green === undefined || blue === undefined) {
      return null;
    }

    const toHex = (channel: number): string => channel.toString(16).padStart(2, "0");
    return `#${toHex(red)}${toHex(green)}${toHex(blue)}`;
  } catch {
    // Undecodable or unsupported input: the caller treats null as "failed".
    return null;
  }
}
|
|
|
|
/**
 * Downloads an object from the configured S3 bucket.
 *
 * @param filename Object key inside `bucket`.
 * @returns The object's bytes, or `null` when the request fails or the
 *          response has no body. The reason is logged as a warning so failed
 *          images can be traced after the run.
 */
async function fetchFromS3(filename: string): Promise<Buffer | null> {
  try {
    const response = await s3.send(new GetObjectCommand({ Bucket: bucket, Key: filename }));
    const bytes = await response.Body?.transformToByteArray();
    if (!bytes) {
      console.warn(`  S3 object "${filename}" returned no body`);
      return null;
    }
    return Buffer.from(bytes);
  } catch (err: unknown) {
    // Best-effort: one missing or unreadable object must not abort the run.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  Failed to fetch "${filename}" from S3: ${reason}`);
    return null;
  }
}
|
|
|
|
/**
 * Downloads an image over HTTP(S).
 *
 * The request is aborted after 10 seconds.
 *
 * @param url Absolute URL of the image.
 * @returns The response body as a Buffer, or `null` on a non-2xx status,
 *          a network error or a timeout. The reason is logged as a warning.
 */
async function fetchFromUrl(url: string): Promise<Buffer | null> {
  try {
    const response = await fetch(url, { signal: AbortSignal.timeout(10000) });
    if (!response.ok) {
      console.warn(`  Failed to fetch ${url}: HTTP ${response.status}`);
      return null;
    }
    return Buffer.from(await response.arrayBuffer());
  } catch (err: unknown) {
    // Best-effort: an unreachable host or timeout only skips this image.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  Failed to fetch ${url}: ${reason}`);
    return null;
  }
}
|
|
|
|
/**
 * Extracts and stores the dominant colour for a list of records, working on
 * a limited number of records concurrently.
 *
 * For each record the image is loaded with `getBuffer`, reduced to a colour
 * with `extractColor`, and persisted with `updateFn`. A record counts as
 * failed when the image cannot be loaded, the colour cannot be extracted, or
 * any step throws (including the database update). Failures are logged with
 * the record id and never abort the run.
 *
 * @param items     Records to process; only `id` is read here.
 * @param getBuffer Loads the image bytes for a record, or `null` if unavailable.
 * @param updateFn  Persists the extracted colour for the given record id.
 * @param label     Prefix used in progress and failure log lines.
 * @param batchSize Maximum number of records processed concurrently
 *                  (defaults to 10; invalid values fall back to 10).
 * @returns Totals of processed, updated and failed records.
 */
async function processBatch<T extends { id: number }>(
  items: T[],
  getBuffer: (item: T) => Promise<Buffer | null>,
  updateFn: (id: number, color: string) => Promise<void>,
  label: string,
  batchSize = 10,
): Promise<{ processed: number; updated: number; failed: number }> {
  // A non-positive or fractional size would stall or skew the loop below.
  const size = Number.isInteger(batchSize) && batchSize > 0 ? batchSize : 10;

  let processed = 0;
  let updated = 0;
  let failed = 0;

  for (let start = 0; start < items.length; start += size) {
    const batch = items.slice(start, start + size);

    // Each task resolves to true when the record was updated and false when
    // the image could not be fetched or decoded; thrown errors become rejections.
    const outcomes = await Promise.allSettled(
      batch.map(async (item): Promise<boolean> => {
        const buffer = await getBuffer(item);
        if (!buffer) return false;
        const color = await extractColor(buffer);
        if (!color) return false;
        await updateFn(item.id, color);
        return true;
      }),
    );

    // Tally after the batch settles so rejected tasks (e.g. a failed DB
    // update) are counted and reported instead of being silently dropped.
    outcomes.forEach((outcome, index) => {
      const id = batch[index]?.id;
      if (outcome.status === "rejected") {
        failed++;
        console.error(`  ${label}: id ${id} failed:`, outcome.reason);
      } else if (outcome.value) {
        updated++;
      } else {
        failed++;
        console.warn(`  ${label}: id ${id} skipped (image could not be fetched or decoded)`);
      }
    });

    processed += batch.length;
    console.log(` ${label}: ${processed}/${items.length} processed, ${updated} updated, ${failed} failed`);
  }

  return { processed, updated, failed };
}
|
|
|
|
/**
 * Runs the backfill: for each table, selects the rows that have an image
 * reference but no dominantColor, extracts the colour and stores it.
 *
 * Rows that already have a dominantColor are excluded by the queries, so the
 * script can be re-run safely. The database connection is closed before the
 * process exits, whether the run succeeds or throws.
 */
async function main() {
  console.log("=== Backfill Dominant Colors ===\n");

  // Loaded once and shared by every query and update below.
  const { eq, and, isNotNull } = await import("drizzle-orm");

  try {
    // Items with imageFilename but no dominantColor (image read from S3)
    const itemsToProcess = await db
      .select({ id: schema.items.id, imageFilename: schema.items.imageFilename })
      .from(schema.items)
      .where(and(isNotNull(schema.items.imageFilename), isNull(schema.items.dominantColor)));

    console.log(`Items: ${itemsToProcess.length} need processing`);
    await processBatch(
      // The isNotNull filter above guarantees imageFilename is set.
      itemsToProcess as { id: number; imageFilename: string }[],
      (item) => fetchFromS3(item.imageFilename),
      async (id, color) => {
        await db.update(schema.items).set({ dominantColor: color }).where(eq(schema.items.id, id));
      },
      "Items",
    );

    // GlobalItems with imageSourceUrl (image downloaded over HTTP)
    const globalWithSourceUrl = await db
      .select({ id: schema.globalItems.id, imageSourceUrl: schema.globalItems.imageSourceUrl })
      .from(schema.globalItems)
      .where(and(isNotNull(schema.globalItems.imageSourceUrl), isNull(schema.globalItems.dominantColor)));

    console.log(`\nGlobal Items (with source URL): ${globalWithSourceUrl.length} need processing`);
    await processBatch(
      globalWithSourceUrl as { id: number; imageSourceUrl: string }[],
      (item) => fetchFromUrl(item.imageSourceUrl),
      async (id, color) => {
        await db.update(schema.globalItems).set({ dominantColor: color }).where(eq(schema.globalItems.id, id));
      },
      "Global Items",
    );

    // GlobalItems with imageUrl (direct URLs). This query runs after the
    // imageSourceUrl pass, so rows coloured there are already excluded and
    // rows whose source URL failed get a second attempt via imageUrl.
    const globalWithUrl = await db
      .select({ id: schema.globalItems.id, imageUrl: schema.globalItems.imageUrl })
      .from(schema.globalItems)
      .where(and(isNotNull(schema.globalItems.imageUrl), isNull(schema.globalItems.dominantColor)));

    console.log(`\nGlobal Items (with image URL): ${globalWithUrl.length} need processing`);
    await processBatch(
      globalWithUrl as { id: number; imageUrl: string }[],
      (item) => fetchFromUrl(item.imageUrl),
      async (id, color) => {
        await db.update(schema.globalItems).set({ dominantColor: color }).where(eq(schema.globalItems.id, id));
      },
      "Global Items (URL)",
    );

    // Thread candidates with imageFilename but no dominantColor (image read from S3)
    const candidatesToProcess = await db
      .select({ id: schema.threadCandidates.id, imageFilename: schema.threadCandidates.imageFilename })
      .from(schema.threadCandidates)
      .where(and(isNotNull(schema.threadCandidates.imageFilename), isNull(schema.threadCandidates.dominantColor)));

    console.log(`\nCandidates: ${candidatesToProcess.length} need processing`);
    await processBatch(
      candidatesToProcess as { id: number; imageFilename: string }[],
      (item) => fetchFromS3(item.imageFilename),
      async (id, color) => {
        await db.update(schema.threadCandidates).set({ dominantColor: color }).where(eq(schema.threadCandidates.id, id));
      },
      "Candidates",
    );

    console.log("\n=== Backfill Complete ===");
  } finally {
    // Close the connection on both success and failure; the timeout (seconds)
    // keeps shutdown from waiting indefinitely on in-flight queries.
    await client.end({ timeout: 5 });
  }

  process.exit(0);
}
|
|
|
|
/**
 * Last-resort handler for the script: reports whatever main() rejected with
 * and terminates the process with exit status 1.
 */
function reportFatal(reason: unknown): never {
  console.error("Backfill failed:", reason);
  process.exit(1);
}

// Kick off the backfill; any rejection is treated as fatal.
main().catch(reportFatal);
|
|
```
|
|
|
|
Note: The exact import patterns for drizzle-orm may need adjustment based on the project's existing database connection setup. Check `src/db/` for the actual connection pattern used and replicate it in the script.
|
|
</action>
|
|
<verify>
|
|
<automated>test -f scripts/backfill-dominant-colors.ts && grep "extractColor" scripts/backfill-dominant-colors.ts && grep "processBatch" scripts/backfill-dominant-colors.ts && echo "PASS" || echo "FAIL"</automated>
|
|
</verify>
|
|
<acceptance_criteria>
|
|
- `scripts/backfill-dominant-colors.ts` exists
|
|
- Script queries items, globalItems, threadCandidates with images but no dominantColor
|
|
- Processes in batches of 10 concurrent
|
|
- Extracts dominant color via Sharp resize(1,1)
|
|
- Updates database records with extracted color
|
|
- Skips records that already have dominantColor (idempotent)
|
|
- Logs progress: `Items: 45/123 processed, 42 updated, 3 failed`
|
|
- Handles errors gracefully (skips failed images, logs them)
|
|
- Exits with 0 on success, 1 on fatal error
|
|
</acceptance_criteria>
|
|
</task>
|
|
|
|
### Task 2: Add npm script for backfill
|
|
<task type="code">
|
|
<read_first>
|
|
- package.json
|
|
</read_first>
|
|
<action>
|
|
Add to `scripts` section in `package.json`:
|
|
```json
|
|
"backfill:colors": "bun run scripts/backfill-dominant-colors.ts"
|
|
```
|
|
</action>
|
|
<verify>
|
|
<automated>grep "backfill:colors" package.json && echo "PASS" || echo "FAIL"</automated>
|
|
</verify>
|
|
<acceptance_criteria>
|
|
- package.json contains `"backfill:colors"` script
|
|
- Script points to `scripts/backfill-dominant-colors.ts`
|
|
</acceptance_criteria>
|
|
</task>
|
|
|
|
</tasks>
|
|
|
|
<verification>
|
|
1. `bun run lint` passes (script follows project conventions)
|
|
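2. Script type-checks without being executed: `bunx tsc --noEmit scripts/backfill-dominant-colors.ts`. Do not run the script itself for this check — it parses no CLI arguments, so `bun run scripts/backfill-dominant-colors.ts --help` would start the real backfill against the configured database.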
|
|
3. Script handles missing S3 credentials gracefully (error message, not crash)
|
|
</verification>
|
|
|
|
<success_criteria>
|
|
- Backfill script exists and processes all 3 tables
|
|
- Script is idempotent (safe to re-run)
|
|
- Batch processing limits concurrency to 10
|
|
- Progress logging shows processing status
|
|
- npm script shortcut available
|
|
</success_criteria>
|
|
|
|
<threat_model>
|
|
| Threat | Severity | Mitigation |
|
|
|--------|----------|------------|
|
|
| S3 credential exposure in script | Low | Uses env vars from process.env, no hardcoded credentials |
|
|
| SSRF via globalItems imageUrl | Medium | Script only processes URLs already stored in the database (previously validated on ingestion); fetch has 10s timeout |
|
|
| Database overload from bulk updates | Low | Batch size of 10 limits concurrent DB writes |
|
|
</threat_model>
|
|
|
|
<must_haves>
|
|
- [ ] Backfill script at scripts/backfill-dominant-colors.ts
|
|
- [ ] Processes items, globalItems, threadCandidates
|
|
- [ ] Idempotent (skips existing dominantColor)
|
|
- [ ] Batch processing with concurrency limit
|
|
- [ ] Progress logging
|
|
- [ ] npm script shortcut
|
|
</must_haves>
|