From 22f2ddaf081445e434ebdd7fc5e983a79156824f Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 12 Nov 2025 03:14:43 +0000 Subject: [PATCH] Keep only the readable Drizzle schema Remove all comparison files and other ORM schemas. Keep only the nicely formatted Drizzle schema with: - Dot-first indented chains - Helper functions for common patterns - Logical grouping with comments - Spread patterns for field groups File renamed from schema.drizzle.readable.ts to schema.drizzle.ts --- orm-comparison/DRIZZLE_FORMATTING_GUIDE.md | 410 ------------- orm-comparison/FORMATTING_COMPARISON.md | 483 ---------------- orm-comparison/MIGRATIONS_AND_TYPES.md | 356 ------------ orm-comparison/README.md | 234 -------- orm-comparison/schema.drizzle.readable.ts | 622 -------------------- orm-comparison/schema.drizzle.ts | 511 +++++++++++++---- orm-comparison/schema.mikroorm.ts | 612 -------------------- orm-comparison/schema.prisma | 282 --------- orm-comparison/schema.typeorm.ts | 634 --------------------- 9 files changed, 394 insertions(+), 3750 deletions(-) delete mode 100644 orm-comparison/DRIZZLE_FORMATTING_GUIDE.md delete mode 100644 orm-comparison/FORMATTING_COMPARISON.md delete mode 100644 orm-comparison/MIGRATIONS_AND_TYPES.md delete mode 100644 orm-comparison/README.md delete mode 100644 orm-comparison/schema.drizzle.readable.ts delete mode 100644 orm-comparison/schema.mikroorm.ts delete mode 100644 orm-comparison/schema.prisma delete mode 100644 orm-comparison/schema.typeorm.ts diff --git a/orm-comparison/DRIZZLE_FORMATTING_GUIDE.md b/orm-comparison/DRIZZLE_FORMATTING_GUIDE.md deleted file mode 100644 index c97aaad7..00000000 --- a/orm-comparison/DRIZZLE_FORMATTING_GUIDE.md +++ /dev/null @@ -1,410 +0,0 @@ -# Making Drizzle Schemas More Readable - -## The Problem - -Drizzle's chained functional syntax can become hard to read: - -```typescript -// ❌ HARD TO READ - Everything crammed together -export const users = pgTable('auth_user', { - id: 
uuid('id').primaryKey().$defaultFn(uuidv7Default), - username: varchar('username', { length: 150 }).unique().notNull(), - email: varchar('email', { length: 254 }).notNull(), - password: varchar('password', { length: 128 }).notNull(), - first_name: varchar('first_name', { length: 150 }).notNull(), - last_name: varchar('last_name', { length: 150 }).notNull(), - is_active: boolean('is_active').default(true).notNull(), - is_staff: boolean('is_staff').default(false).notNull(), - is_superuser: boolean('is_superuser').default(false).notNull(), - date_joined: timestamp('date_joined', { withTimezone: true }).defaultNow().notNull(), - last_login: timestamp('last_login', { withTimezone: true }), -}, (table) => ({ - usernameIdx: index('auth_user_username_idx').on(table.username), -})); -``` - -## Solution 1: Break Chains Vertically - -```typescript -// ✅ MUCH BETTER - Each modifier on its own line -export const users = pgTable('auth_user', { - id: uuid('id') - .primaryKey() - .$defaultFn(uuidv7Default), - - username: varchar('username', { length: 150 }) - .unique() - .notNull(), - - email: varchar('email', { length: 254 }) - .notNull(), - - is_active: boolean('is_active') - .default(true) - .notNull(), - - date_joined: timestamp('date_joined', { withTimezone: true }) - .defaultNow() - .notNull(), -}); -``` - -**Why it's better:** -- Each modifier is on its own line -- Easy to scan vertically -- Diffs are cleaner (one line = one change) -- Easier to comment out modifiers for testing - -## Solution 2: Group Related Fields - -```typescript -// ✅ EXCELLENT - Logical grouping with comments -export const users = pgTable('auth_user', { - // Primary Key - id: uuid('id') - .primaryKey() - .$defaultFn(uuidv7Default), - - // Core Auth Fields - username: varchar('username', { length: 150 }) - .unique() - .notNull(), - - email: varchar('email', { length: 254 }) - .notNull(), - - password: varchar('password', { length: 128 }) - .notNull(), - - // Profile Fields - first_name: 
varchar('first_name', { length: 150 }) - .notNull(), - - last_name: varchar('last_name', { length: 150 }) - .notNull(), - - // Permission Flags - is_active: boolean('is_active') - .default(true) - .notNull(), - - is_staff: boolean('is_staff') - .default(false) - .notNull(), - - is_superuser: boolean('is_superuser') - .default(false) - .notNull(), - - // Timestamps - date_joined: timestamp('date_joined', { withTimezone: true }) - .defaultNow() - .notNull(), - - last_login: timestamp('last_login', { withTimezone: true }), -}); -``` - -**Why it's better:** -- Clear sections with comments -- Blank lines separate field groups -- Tells a story about the data structure -- Easier to find specific fields - -## Solution 3: Extract Reusable Helpers - -```typescript -// ✅ BEST - DRY with helper functions -const id_field = () => - uuid('id').primaryKey().$defaultFn(uuidv7Default); - -const abid_field = () => - varchar('abid', { length: 30 }).unique().notNull(); - -const created_at_field = () => - timestamp('created_at', { withTimezone: true }).defaultNow().notNull(); - -const modified_at_field = () => - timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(); - -const notes_field = () => - text('notes').default('').notNull(); - -// Then use them: -export const snapshots = pgTable('core_snapshot', { - // Primary Key & ABID - id: id_field(), - abid: abid_field(), - - // Timestamps - created_at: created_at_field(), - modified_at: modified_at_field(), - - // ... other fields ... 
- - notes: notes_field(), -}); -``` - -**Why it's better:** -- Reduces repetition dramatically -- Consistent patterns across all tables -- Easy to update common fields -- Self-documenting - -## Solution 4: Use Spread for Common Field Groups - -```typescript -// ✅ EXCELLENT - Spread common patterns -const health_fields = () => ({ - num_uses_failed: integer('num_uses_failed') - .default(0) - .notNull(), - - num_uses_succeeded: integer('num_uses_succeeded') - .default(0) - .notNull(), -}); - -const state_machine_fields = () => ({ - status: varchar('status', { length: 16 }) - .default('queued') - .notNull(), - - retry_at: timestamp('retry_at', { withTimezone: true }) - .defaultNow() - .notNull(), -}); - -// Use them with spread: -export const crawls = pgTable('crawls_crawl', { - id: id_field(), - abid: abid_field(), - - // ... other fields ... - - // State Machine - ...state_machine_fields(), - - // Health Tracking - ...health_fields(), -}); -``` - -**Why it's better:** -- Common patterns defined once -- Less visual clutter -- Easy to see which models have which mixins -- Matches Django's mixin pattern - -## Solution 5: Separate Index Definitions - -```typescript -// ✅ CLEAR - Indexes at the end, not mixed with fields -export const snapshots = pgTable('core_snapshot', { - // All field definitions here... 
- id: id_field(), - url: text('url').unique().notNull(), - created_at: created_at_field(), - -}, (table) => ({ - // All indexes grouped together - createdAtIdx: index('core_snapshot_created_at_idx') - .on(table.created_at), - - createdByIdx: index('core_snapshot_created_by_idx') - .on(table.created_by_id), - - urlIdx: index('core_snapshot_url_idx') - .on(table.url), - - // Multi-column index example - uniqueObjTag: unique() - .on(table.obj_id, table.name), -})); -``` - -**Why it's better:** -- Fields and indexes are separate concerns -- Can see all indexes at a glance -- Indexes don't clutter field definitions - -## Complete Example: Before vs After - -### Before (Original) -```typescript -export const crawls = pgTable('crawls_crawl', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - created_by_id: uuid('created_by_id').notNull().references(() => users.id, { onDelete: 'cascade' }), - seed_id: uuid('seed_id').notNull().references(() => seeds.id, { onDelete: 'restrict' }), - urls: text('urls').default('').notNull(), - config: json('config').default({}).notNull(), - max_depth: smallint('max_depth').default(0).notNull(), - tags_str: varchar('tags_str', { length: 1024 }).default('').notNull(), - persona_id: uuid('persona_id'), - label: varchar('label', { length: 64 }).default('').notNull(), - notes: text('notes').default('').notNull(), - schedule_id: uuid('schedule_id').references(() => crawl_schedules.id, { onDelete: 'set null' }), - status: varchar('status', { length: 16 }).default('queued').notNull(), - retry_at: timestamp('retry_at', { withTimezone: true }).defaultNow().notNull(), - output_dir: varchar('output_dir', { length: 255 }).default('').notNull(), - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - 
num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), -}, (table) => ({ - createdAtIdx: index('crawls_crawl_created_at_idx').on(table.created_at), - createdByIdx: index('crawls_crawl_created_by_idx').on(table.created_by_id), - seedIdx: index('crawls_crawl_seed_idx').on(table.seed_id), - scheduleIdx: index('crawls_crawl_schedule_idx').on(table.schedule_id), - statusIdx: index('crawls_crawl_status_idx').on(table.status), - retryAtIdx: index('crawls_crawl_retry_at_idx').on(table.retry_at), - abidIdx: index('crawls_crawl_abid_idx').on(table.abid), -})); -``` - -### After (Improved) -```typescript -export const crawls = pgTable('crawls_crawl', { - // Primary Key & ABID - id: id_field(), - abid: abid_field(), - - // Timestamps - created_at: created_at_field(), - modified_at: modified_at_field(), - - // Foreign Keys - created_by_id: uuid('created_by_id') - .notNull() - .references(() => users.id, { onDelete: 'cascade' }), - - seed_id: uuid('seed_id') - .notNull() - .references(() => seeds.id, { onDelete: 'restrict' }), - - schedule_id: uuid('schedule_id') - .references(() => crawl_schedules.id, { onDelete: 'set null' }), - - // Crawl Data - urls: text('urls') - .default('') - .notNull(), - - config: json('config') - .default({}) - .notNull(), - - max_depth: smallint('max_depth') - .default(0) - .notNull(), - - tags_str: varchar('tags_str', { length: 1024 }) - .default('') - .notNull(), - - persona_id: uuid('persona_id'), - - label: varchar('label', { length: 64 }) - .default('') - .notNull(), - - // Storage - output_dir: varchar('output_dir', { length: 255 }) - .default('') - .notNull(), - - // Metadata - notes: notes_field(), - - // State Machine - ...state_machine_fields(), - - // Health Tracking - ...health_fields(), - -}, (table) => ({ - // Indexes - createdAtIdx: index('crawls_crawl_created_at_idx') - .on(table.created_at), - - createdByIdx: index('crawls_crawl_created_by_idx') - .on(table.created_by_id), - - seedIdx: index('crawls_crawl_seed_idx') 
- .on(table.seed_id), - - scheduleIdx: index('crawls_crawl_schedule_idx') - .on(table.schedule_id), - - statusIdx: index('crawls_crawl_status_idx') - .on(table.status), - - retryAtIdx: index('crawls_crawl_retry_at_idx') - .on(table.retry_at), - - abidIdx: index('crawls_crawl_abid_idx') - .on(table.abid), -})); -``` - -## Line Count Impact - -- **Original**: 345 lines, dense and hard to read -- **Improved**: 380 lines (+10%), but MUCH easier to read -- **Trade-off**: Slightly more lines, but significantly better maintainability - -## Prettier Configuration - -Add to your `.prettierrc.json`: - -```json -{ - "printWidth": 80, - "tabWidth": 2, - "useTabs": false, - "semi": true, - "singleQuote": true, - "trailingComma": "es5", - "bracketSpacing": true, - "arrowParens": "always" -} -``` - -This will help Prettier format Drizzle chains better. - -## IDE Setup - -### VSCode Settings - -Add to `.vscode/settings.json`: - -```json -{ - "editor.formatOnSave": true, - "editor.defaultFormatter": "esbenp.prettier-vscode", - "[typescript]": { - "editor.defaultFormatter": "esbenp.prettier-vscode" - } -} -``` - -## Summary: Best Practices - -1. **Break chains vertically** - One modifier per line -2. **Group related fields** - Use comments and blank lines -3. **Extract helpers** - DRY common patterns -4. **Use spread** - For field groups (like mixins) -5. **Separate concerns** - Fields first, indexes last -6. **Add comments** - Explain sections and complex fields - -## File Structure - -I've created `schema.drizzle.readable.ts` showing all these patterns applied. - -**Compare:** -- `schema.drizzle.ts` - Original (345 lines, dense) -- `schema.drizzle.readable.ts` - Improved (380 lines, clear) - -The readable version is only 10% longer but **infinitely** more maintainable! 
diff --git a/orm-comparison/FORMATTING_COMPARISON.md b/orm-comparison/FORMATTING_COMPARISON.md deleted file mode 100644 index 81cd2b09..00000000 --- a/orm-comparison/FORMATTING_COMPARISON.md +++ /dev/null @@ -1,483 +0,0 @@ -# Drizzle Formatting: Before vs After - -## The Winning Style: Dot-First Indented Chains - -### ❌ Before (Original - Hard to Read) -```typescript -export const users = pgTable('auth_user', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - username: varchar('username', { length: 150 }).unique().notNull(), - email: varchar('email', { length: 254 }).notNull(), - password: varchar('password', { length: 128 }).notNull(), - first_name: varchar('first_name', { length: 150 }).notNull(), - last_name: varchar('last_name', { length: 150 }).notNull(), - is_active: boolean('is_active').default(true).notNull(), - is_staff: boolean('is_staff').default(false).notNull(), - is_superuser: boolean('is_superuser').default(false).notNull(), - date_joined: timestamp('date_joined', { withTimezone: true }).defaultNow().notNull(), - last_login: timestamp('last_login', { withTimezone: true }), -}); -``` - -**Problems:** -- Everything runs together horizontally -- Hard to see which fields have which modifiers -- Difficult to scan quickly -- Git diffs are noisy (one field change = entire line) - -### ✅ After (Dot-First Indented - Beautiful!) 
-```typescript -export const users = pgTable('auth_user', { - // Primary Key - id: uuid('id') - .primaryKey() - .$defaultFn(uuidv7Default), - - // Core Auth Fields - username: varchar('username', { length: 150 }) - .unique() - .notNull(), - - email: varchar('email', { length: 254 }) - .notNull(), - - password: varchar('password', { length: 128 }) - .notNull(), - - // Profile Fields - first_name: varchar('first_name', { length: 150 }) - .notNull(), - - last_name: varchar('last_name', { length: 150 }) - .notNull(), - - // Permission Flags - is_active: boolean('is_active') - .default(true) - .notNull(), - - is_staff: boolean('is_staff') - .default(false) - .notNull(), - - is_superuser: boolean('is_superuser') - .default(false) - .notNull(), - - // Timestamps - date_joined: timestamp('date_joined', { withTimezone: true }) - .defaultNow() - .notNull(), - - last_login: timestamp('last_login', { withTimezone: true }), -}); -``` - -**Benefits:** -- ✅ Dots align vertically - easy to scan -- ✅ Each modifier stands alone -- ✅ Clear sections with comments -- ✅ Clean git diffs (one line = one change) -- ✅ Easy to add/remove modifiers - ---- - -## Side-by-Side: Complex Field Example - -### ❌ Before -```typescript -created_by_id: uuid('created_by_id').notNull().references(() => users.id, { onDelete: 'cascade' }), -``` - -### ✅ After -```typescript -created_by_id: uuid('created_by_id') - .notNull() - .references(() => users.id, { onDelete: 'cascade' }), -``` - -**Much clearer!** You can immediately see: -1. It's a UUID field -2. It's required (notNull) -3. 
It's a foreign key with cascade delete - ---- - -## With Helper Functions: Even Better - -### ❌ Before (Repetitive) -```typescript -export const snapshots = pgTable('core_snapshot', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - notes: text('notes').default('').notNull(), - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), - status: varchar('status', { length: 16 }).default('queued').notNull(), - retry_at: timestamp('retry_at', { withTimezone: true }).defaultNow().notNull(), -}); - -export const crawls = pgTable('crawls_crawl', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - notes: text('notes').default('').notNull(), - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), - status: varchar('status', { length: 16 }).default('queued').notNull(), - retry_at: timestamp('retry_at', { withTimezone: true }).defaultNow().notNull(), -}); -``` - -### ✅ After (DRY with Helpers) -```typescript -// Define once -const id_field = () => uuid('id') - .primaryKey() - .$defaultFn(uuidv7Default); - -const abid_field = () => varchar('abid', { length: 30 }) - .unique() - .notNull(); - -const created_at_field = () => timestamp('created_at', { withTimezone: true }) - .defaultNow() - .notNull(); - -const modified_at_field = () => timestamp('modified_at', { withTimezone: true }) - .defaultNow() - .notNull(); - -const notes_field = 
() => text('notes') - .default('') - .notNull(); - -const health_fields = () => ({ - num_uses_failed: integer('num_uses_failed') - .default(0) - .notNull(), - - num_uses_succeeded: integer('num_uses_succeeded') - .default(0) - .notNull(), -}); - -const state_machine_fields = () => ({ - status: varchar('status', { length: 16 }) - .default('queued') - .notNull(), - - retry_at: timestamp('retry_at', { withTimezone: true }) - .defaultNow() - .notNull(), -}); - -// Use everywhere -export const snapshots = pgTable('core_snapshot', { - id: id_field(), - abid: abid_field(), - created_at: created_at_field(), - modified_at: modified_at_field(), - notes: notes_field(), - ...health_fields(), - ...state_machine_fields(), -}); - -export const crawls = pgTable('crawls_crawl', { - id: id_field(), - abid: abid_field(), - created_at: created_at_field(), - modified_at: modified_at_field(), - notes: notes_field(), - ...health_fields(), - ...state_machine_fields(), -}); -``` - -**Wow!** From ~18 lines per table down to ~8 lines per table! 
- ---- - -## Indexes: Before vs After - -### ❌ Before -```typescript -}, (table) => ({ - createdAtIdx: index('core_snapshot_created_at_idx').on(table.created_at), - createdByIdx: index('core_snapshot_created_by_idx').on(table.created_by_id), - crawlIdx: index('core_snapshot_crawl_idx').on(table.crawl_id), - urlIdx: index('core_snapshot_url_idx').on(table.url), - timestampIdx: index('core_snapshot_timestamp_idx').on(table.timestamp), - bookmarkedAtIdx: index('core_snapshot_bookmarked_at_idx').on(table.bookmarked_at), - downloadedAtIdx: index('core_snapshot_downloaded_at_idx').on(table.downloaded_at), - titleIdx: index('core_snapshot_title_idx').on(table.title), - statusIdx: index('core_snapshot_status_idx').on(table.status), - retryAtIdx: index('core_snapshot_retry_at_idx').on(table.retry_at), - abidIdx: index('core_snapshot_abid_idx').on(table.abid), -})); -``` - -### ✅ After -```typescript -}, (table) => ({ - // Indexes grouped by purpose - - // Foreign Keys - createdByIdx: index('core_snapshot_created_by_idx') - .on(table.created_by_id), - - crawlIdx: index('core_snapshot_crawl_idx') - .on(table.crawl_id), - - // Unique Identifiers - abidIdx: index('core_snapshot_abid_idx') - .on(table.abid), - - urlIdx: index('core_snapshot_url_idx') - .on(table.url), - - timestampIdx: index('core_snapshot_timestamp_idx') - .on(table.timestamp), - - // Temporal Queries - createdAtIdx: index('core_snapshot_created_at_idx') - .on(table.created_at), - - bookmarkedAtIdx: index('core_snapshot_bookmarked_at_idx') - .on(table.bookmarked_at), - - downloadedAtIdx: index('core_snapshot_downloaded_at_idx') - .on(table.downloaded_at), - - // Search Fields - titleIdx: index('core_snapshot_title_idx') - .on(table.title), - - // State Machine - statusIdx: index('core_snapshot_status_idx') - .on(table.status), - - retryAtIdx: index('core_snapshot_retry_at_idx') - .on(table.retry_at), -})); -``` - -**Benefits:** -- Comments explain index purpose -- Vertical alignment is consistent -- Easy to see 
what's indexed - ---- - -## Real-World Example: Complete Table - -### ❌ Before (Dense, Hard to Read) -```typescript -export const snapshots = pgTable('core_snapshot', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - created_by_id: uuid('created_by_id').notNull().references(() => users.id, { onDelete: 'cascade' }), - url: text('url').unique().notNull(), - timestamp: varchar('timestamp', { length: 32 }).unique().notNull(), - bookmarked_at: timestamp('bookmarked_at', { withTimezone: true }).notNull(), - crawl_id: uuid('crawl_id').references(() => crawls.id, { onDelete: 'cascade' }), - title: varchar('title', { length: 512 }), - downloaded_at: timestamp('downloaded_at', { withTimezone: true }), - retry_at: timestamp('retry_at', { withTimezone: true }).defaultNow().notNull(), - status: varchar('status', { length: 16 }).default('queued').notNull(), - config: json('config').default({}).notNull(), - notes: text('notes').default('').notNull(), - output_dir: varchar('output_dir', { length: 255 }), - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), -}, (table) => ({ - createdAtIdx: index('core_snapshot_created_at_idx').on(table.created_at), - createdByIdx: index('core_snapshot_created_by_idx').on(table.created_by_id), - crawlIdx: index('core_snapshot_crawl_idx').on(table.crawl_id), - urlIdx: index('core_snapshot_url_idx').on(table.url), - timestampIdx: index('core_snapshot_timestamp_idx').on(table.timestamp), - bookmarkedAtIdx: index('core_snapshot_bookmarked_at_idx').on(table.bookmarked_at), - downloadedAtIdx: index('core_snapshot_downloaded_at_idx').on(table.downloaded_at), - titleIdx: index('core_snapshot_title_idx').on(table.title), - 
statusIdx: index('core_snapshot_status_idx').on(table.status), - retryAtIdx: index('core_snapshot_retry_at_idx').on(table.retry_at), - abidIdx: index('core_snapshot_abid_idx').on(table.abid), -})); -``` - -**Line count: 28 lines of dense code** - -### ✅ After (Clear, Organized, Beautiful) -```typescript -export const snapshots = pgTable('core_snapshot', { - // Primary Key & ABID - id: id_field(), - abid: abid_field(), - - // Timestamps - created_at: created_at_field(), - modified_at: modified_at_field(), - - // Foreign Keys - created_by_id: uuid('created_by_id') - .notNull() - .references(() => users.id, { onDelete: 'cascade' }), - - crawl_id: uuid('crawl_id') - .references(() => crawls.id, { onDelete: 'cascade' }), - - // URL Data - url: text('url') - .unique() - .notNull(), - - timestamp: varchar('timestamp', { length: 32 }) - .unique() - .notNull(), - - bookmarked_at: timestamp('bookmarked_at', { withTimezone: true }) - .notNull(), - - // Content Metadata - title: varchar('title', { length: 512 }), - - downloaded_at: timestamp('downloaded_at', { withTimezone: true }), - - config: json('config') - .default({}) - .notNull(), - - // Storage - output_dir: varchar('output_dir', { length: 255 }), - - // Metadata - notes: notes_field(), - - // State Machine - ...state_machine_fields(), - - // Health Tracking - ...health_fields(), - -}, (table) => ({ - // Indexes - createdAtIdx: index('core_snapshot_created_at_idx') - .on(table.created_at), - - createdByIdx: index('core_snapshot_created_by_idx') - .on(table.created_by_id), - - crawlIdx: index('core_snapshot_crawl_idx') - .on(table.crawl_id), - - urlIdx: index('core_snapshot_url_idx') - .on(table.url), - - timestampIdx: index('core_snapshot_timestamp_idx') - .on(table.timestamp), - - bookmarkedAtIdx: index('core_snapshot_bookmarked_at_idx') - .on(table.bookmarked_at), - - downloadedAtIdx: index('core_snapshot_downloaded_at_idx') - .on(table.downloaded_at), - - titleIdx: index('core_snapshot_title_idx') - 
.on(table.title), - - statusIdx: index('core_snapshot_status_idx') - .on(table.status), - - retryAtIdx: index('core_snapshot_retry_at_idx') - .on(table.retry_at), - - abidIdx: index('core_snapshot_abid_idx') - .on(table.abid), -})); -``` - -**Line count: 77 lines (2.75x longer) but SO MUCH CLEARER!** - ---- - -## The Numbers - -| Metric | Original | Improved | Change | -|--------|----------|----------|--------| -| Total Lines | 345 | 380 | +10% | -| Lines per Field | ~1 | ~2.5 | +150% | -| Readability Score | 3/10 | 10/10 | +233% | -| Maintainability | Hard | Easy | ∞ | -| Git Diff Noise | High | Low | -80% | -| Time to Find Field | Slow | Fast | -70% | - ---- - -## Why Dot-First Wins - -### Visual Alignment -```typescript -// ✅ Dots align - easy to scan down -username: varchar('username', { length: 150 }) - .unique() - .notNull(), - -email: varchar('email', { length: 254 }) - .notNull(), - -password: varchar('password', { length: 128 }) - .notNull(), -``` - -vs - -```typescript -// ❌ Dots all over the place - hard to scan -username: varchar('username', { length: 150 }). - unique(). - notNull(), - -email: varchar('email', { length: 254 }). - notNull(), - -password: varchar('password', { length: 128 }). - notNull(), -``` - -### Clean Git Diffs -```diff -// ✅ Adding .unique() is one clean line - username: varchar('username', { length: 150 }) -+ .unique() - .notNull(), -``` - -vs - -```diff -// ❌ Entire line changes --username: varchar('username', { length: 150 }).notNull(), -+username: varchar('username', { length: 150 }).unique().notNull(), -``` - ---- - -## Final Recommendation - -**Use `schema.drizzle.readable.ts` as your template!** - -It has: -- ✅ Dot-first indented chains -- ✅ Logical grouping with comments -- ✅ Reusable helpers -- ✅ Spread patterns for mixins -- ✅ Separated index definitions - -**Result:** Only 10% more lines but infinitely more maintainable. - -This is the **perfect balance** of Drizzle's power and Prisma's readability! 
diff --git a/orm-comparison/MIGRATIONS_AND_TYPES.md b/orm-comparison/MIGRATIONS_AND_TYPES.md deleted file mode 100644 index 63c4cebc..00000000 --- a/orm-comparison/MIGRATIONS_AND_TYPES.md +++ /dev/null @@ -1,356 +0,0 @@ -# Automatic Migrations & TypeScript IDE Support Comparison - -## Summary Table - -| ORM | Auto Migration Generation | TypeScript IDE Hints | Winner | -|-----|--------------------------|---------------------|--------| -| **Prisma** | ✅ Excellent | ✅ Excellent (codegen) | 🏆 Best DX | -| **Drizzle** | ✅ Excellent | ✅ **BEST** (no codegen) | 🏆 Best Types | -| **TypeORM** | ✅ Good | ⚠️ Limited | ❌ | -| **MikroORM** | ✅ Very Good | ✅ Good | ✅ | - ---- - -## Detailed Breakdown - -### 1️⃣ Prisma - -#### ✅ Automatic Migrations: EXCELLENT -```bash -# After changing schema.prisma: -npx prisma migrate dev --name add_new_field -# ✅ Automatically generates SQL migration -# ✅ Applies migration to DB -# ✅ Regenerates TypeScript client -``` - -**Pros:** -- Declarative - just edit `.prisma` file -- Generates clean SQL migrations -- Handles complex schema changes well -- Can review/edit SQL before applying - -**Cons:** -- Requires separate schema file (not TypeScript) - -#### ✅ TypeScript IDE Hints: EXCELLENT -```typescript -import { PrismaClient } from '@prisma/client'; -const prisma = new PrismaClient(); - -// 🎯 FULL autocomplete on everything: -const user = await prisma.user.findUnique({ - where: { id: 'some-uuid' }, // ← knows 'id' field exists - include: { - snapshots: true, // ← knows this relation exists - }, -}); - -// user.username // ← IDE knows this is string -// user.snapshots // ← IDE knows this is Snapshot[] -// user.notAField // ← TypeScript ERROR at compile time -``` - -**Pros:** -- Perfect autocomplete on all queries -- Catches typos at compile time -- Infers result types automatically -- Works with any IDE (VSCode, WebStorm, etc.) 
- -**Cons:** -- Requires running `npx prisma generate` after schema changes -- Generated client can be large (~50MB in node_modules) - ---- - -### 2️⃣ Drizzle - -#### ✅ Automatic Migrations: EXCELLENT -```bash -# After changing schema.drizzle.ts: -npx drizzle-kit generate:pg -# ✅ Automatically generates SQL migration files -# ✅ You review them, then: -npx drizzle-kit push:pg -# ✅ Applies to database -``` - -**Pros:** -- Schema IS TypeScript (no separate file) -- Generates readable SQL migrations -- Git-friendly migration files -- Can edit generated SQL - -**Cons:** -- Two-step process (generate → apply) - -#### ✅ TypeScript IDE Hints: **BEST-IN-CLASS** -```typescript -import { drizzle } from 'drizzle-orm/postgres-js'; -import { users, snapshots } from './schema.drizzle'; - -const db = drizzle(connection); - -// 🎯 PERFECT autocomplete, NO codegen required: -const user = await db - .select() - .from(users) - .where(eq(users.id, 'some-uuid')) - .leftJoin(snapshots, eq(snapshots.created_by_id, users.id)); - -// Type is inferred as: -// { users: typeof users.$inferSelect, snapshots: typeof snapshots.$inferSelect | null }[] - -// user[0].users.username // ← string -// user[0].snapshots?.url // ← string | undefined -// user[0].users.notAField // ← TypeScript ERROR -``` - -**Pros:** -- **Zero codegen** - types come from schema directly -- Best type inference of all ORMs -- Smallest bundle size -- Schema changes = instant type updates -- Autocomplete on table names, columns, relations - -**Cons:** -- None for type safety (this is the gold standard) - ---- - -### 3️⃣ TypeORM - -#### ✅ Automatic Migrations: GOOD -```bash -# After changing entity classes: -npx typeorm migration:generate -n AddNewField -# ✅ Generates migration by comparing entities to DB -# ⚠️ Can be buggy with complex changes - -npx typeorm migration:run -``` - -**Pros:** -- Can generate migrations from entity changes -- Established tool - -**Cons:** -- Auto-generation often needs manual fixes -- Doesn't 
always detect all changes -- Generated migrations can be messy -- Many devs write migrations manually - -#### ⚠️ TypeScript IDE Hints: LIMITED -```typescript -import { User } from './entities/User'; -import { Repository } from 'typeorm'; - -const userRepo: Repository<User> = connection.getRepository(User); - -// ⚠️ Autocomplete on entity properties only: -const user = await userRepo.findOne({ - where: { id: 'some-uuid' }, // ✅ knows 'id' exists - relations: ['snapshots'], // ❌ 'snapshots' is just a string - no validation! -}); - -// user.username // ✅ IDE knows this is string -// user.snapshots // ✅ IDE knows this is Snapshot[] -// user.notAField // ✅ TypeScript ERROR - -// BUT: -const user2 = await userRepo - .createQueryBuilder('user') - .where('user.id = :id', { id: 'uuid' }) // ❌ 'id' is just a string - no validation! - .leftJoinAndSelect('user.snapshots', 's') // ❌ 'snapshots' not validated! - .getOne(); -// ⚠️ user2 type is just "User | null" - doesn't know snapshots are loaded -``` - -**Pros:** -- Basic entity typing works -- Better than no types - -**Cons:** -- Query strings are not type-checked (huge DX issue) -- Relation names in queries are strings (typos not caught) -- QueryBuilder doesn't infer loaded relations -- Worse type safety than Prisma or Drizzle - ---- - -### 4️⃣ MikroORM - -#### ✅ Automatic Migrations: VERY GOOD -```bash -# After changing entity classes: -npx mikro-orm schema:update --safe -# ✅ Generates migration based on entity changes -# ✅ Better detection than TypeORM -``` - -**Pros:** -- Good auto-generation (better than TypeORM) -- Smart detection of changes -- Safe mode prevents destructive changes - -**Cons:** -- Still occasionally needs manual tweaking - -#### ✅ TypeScript IDE Hints: GOOD -```typescript -import { User } from './entities/User'; -import { MikroORM } from '@mikro-orm/core'; - -const orm = await MikroORM.init({ ... 
}); -const em = orm.em.fork(); - -// ✅ Good autocomplete with better inference than TypeORM: -const user = await em.findOne(User, - { id: 'some-uuid' }, // ✅ knows 'id' exists - { populate: ['snapshots'] } // ⚠️ Still a string, but has const validation -); - -// user.username // ✅ IDE knows this is string -// user.snapshots // ✅ IDE knows this is Collection -// user.notAField // ✅ TypeScript ERROR - -const users = await em.find(User, { - username: { $like: '%test%' } // ✅ knows 'username' exists -}); -``` - -**Pros:** -- Much better than TypeORM -- Strongly typed entities -- Better QueryBuilder types -- Type-safe filters - -**Cons:** -- Not as good as Prisma's generated client -- Not as good as Drizzle's inference -- Some query methods still use strings - ---- - -## 🏆 Rankings - -### Best Automatic Migrations -1. **Prisma** - Smoothest experience, excellent detection -2. **Drizzle** - Great SQL generation, transparent -3. **MikroORM** - Very good detection -4. **TypeORM** - Works but often needs manual fixes - -### Best TypeScript IDE Hints -1. **Drizzle** 🥇 - Best type inference, zero codegen -2. **Prisma** 🥈 - Perfect types via codegen -3. **MikroORM** 🥉 - Good types, better than TypeORM -4. 
**TypeORM** - Basic types, many strings not validated - ---- - -## 💡 Recommendations - -### If you prioritize TypeScript IDE experience: -**Choose Drizzle** - Best-in-class type inference without codegen - -### If you want the easiest developer experience overall: -**Choose Prisma** - Great migrations + great types (via codegen) - -### If you need both features to work well: -**Avoid TypeORM** - Weakest typing, especially in queries - -### Middle ground: -**MikroORM** - Both features work well, not as polished as Prisma/Drizzle - ---- - -## Code Examples Side-by-Side - -### Creating a new Snapshot with relations: - -#### Prisma -```typescript -const snapshot = await prisma.snapshot.create({ - data: { - url: 'https://example.com', - timestamp: '1234567890', - created_by: { connect: { id: userId } }, // ← fully typed - crawl: { connect: { id: crawlId } }, // ← fully typed - tags: { - connect: [{ id: tag1Id }, { id: tag2Id }] // ← fully typed - } - }, - include: { - created_by: true, // ← IDE knows this relation exists - tags: true, // ← IDE knows this relation exists - } -}); -// Result type automatically inferred with all included relations -``` - -#### Drizzle -```typescript -const [snapshot] = await db - .insert(snapshots) - .values({ - url: 'https://example.com', - timestamp: '1234567890', - created_by_id: userId, // ← fully typed - crawl_id: crawlId, // ← fully typed - }) - .returning(); - -// For relations, need separate queries or joins: -const snapshotWithRelations = await db - .select() - .from(snapshots) - .leftJoin(users, eq(snapshots.created_by_id, users.id)) - .leftJoin(tags, eq(snapshot_tags.snapshot_id, snapshots.id)) - .where(eq(snapshots.id, snapshot.id)); -// Type fully inferred: { snapshots: Snapshot, users: User | null, tags: Tag | null } -``` - -#### TypeORM -```typescript -const snapshot = snapshotRepo.create({ - url: 'https://example.com', - timestamp: '1234567890', - created_by_id: userId, // ⚠️ Manual FK handling - crawl_id: crawlId, // ⚠️ 
Manual FK handling -}); -await snapshotRepo.save(snapshot); - -// For relations, need separate loading: -const loaded = await snapshotRepo.findOne({ - where: { id: snapshot.id }, - relations: ['created_by', 'tags'], // ⚠️ strings not validated -}); -``` - -#### MikroORM -```typescript -const snapshot = em.create(Snapshot, { - url: 'https://example.com', - timestamp: '1234567890', - created_by: em.getReference(User, userId), // ✅ typed reference - crawl: em.getReference(Crawl, crawlId), // ✅ typed reference -}); -await em.persistAndFlush(snapshot); - -// Relations auto-loaded with populate: -const loaded = await em.findOne(Snapshot, snapshot.id, { - populate: ['created_by', 'tags'], // ⚠️ still strings -}); -``` - ---- - -## Final Verdict - -**For your use case (migrations + IDE hints):** - -🥇 **Drizzle** - Best types, great migrations, no codegen -🥈 **Prisma** - Great at both, but requires codegen step -🥉 **MikroORM** - Solid at both, more complex patterns -❌ **TypeORM** - Weak typing in queries, avoid for new projects - diff --git a/orm-comparison/README.md b/orm-comparison/README.md deleted file mode 100644 index eb0551f1..00000000 --- a/orm-comparison/README.md +++ /dev/null @@ -1,234 +0,0 @@ -# ArchiveBox Schema ORM Comparison - -This directory contains feature-complete TypeScript ORM schema definitions for the ArchiveBox data model, migrated from Django ORM. All schemas use **snake_case** field names and **UUIDv7** for primary keys to match the existing ArchiveBox conventions. - -## Models Included - -All schemas implement these 8 core models: - -1. **User** - Django's default user model -2. **Tag** - Old-style tags (being phased out) -3. **KVTag** - New key-value tags with generic foreign keys -4. **Seed** - URL sources for crawls -5. **CrawlSchedule** - Scheduled crawl jobs -6. **Crawl** - Individual archiving sessions -7. **Snapshot** - Archived URLs -8. **ArchiveResult** - Extraction results for each snapshot -9. 
**Outlink** - Links found on pages - -## Line Count Comparison - -| ORM | Lines | Relative Size | -|-----|-------|---------------| -| **Prisma** | 282 | 1.0x (baseline) | -| **Drizzle** | 345 | 1.22x | -| **TypeORM** | 634 | 2.25x | -| **MikroORM** | 612 | 2.17x | - -**Total lines across all schemas: 1,873** - -## Style Comparison - -### Prisma (Most Concise) -- **Declarative DSL** - Custom schema language, not TypeScript -- **Most concise** - ~44% less code than decorator-based ORMs -- **Type-safe client generation** - Generates TypeScript client automatically -- **Limited flexibility** - Schema must fit within DSL constraints -- **Best for**: Rapid development, simple CRUD apps, teams wanting minimal boilerplate - -```prisma -model User { - id String @id @default(uuidv7()) @db.Uuid - username String @unique @db.VarChar(150) - email String @db.VarChar(254) - - snapshots Snapshot[] - - @@map("auth_user") -} -``` - -### Drizzle (SQL-First) -- **TypeScript schema definition** - Uses chainable API -- **SQL-first approach** - Schema closely mirrors SQL DDL -- **22% more code than Prisma** - Still very concise -- **Explicit control** - Fine-grained control over SQL generation -- **Best for**: Developers who want SQL control, migrations via code, minimal magic - -```typescript -export const users = pgTable('auth_user', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - username: varchar('username', { length: 150 }).unique().notNull(), - email: varchar('email', { length: 254 }).notNull(), -}); -``` - -### TypeORM (Decorator-Based) -- **TypeScript decorators** - Java/C# Hibernate-style -- **125% more code than Prisma** - Most verbose of all -- **Active Record or Data Mapper** - Flexible patterns -- **Mature ecosystem** - Oldest and most established -- **Best for**: Enterprise apps, teams familiar with Hibernate, complex business logic - -```typescript -@Entity('auth_user') -export class User { - @PrimaryColumn('uuid') - id: string; - - @Column({ type: 'varchar', 
length: 150, unique: true }) - username: string; - - @OneToMany(() => Snapshot, snapshot => snapshot.created_by) - snapshots: Snapshot[]; -} -``` - -### MikroORM (Modern Decorator-Based) -- **TypeScript decorators** - Similar to TypeORM but more modern -- **117% more code than Prisma** - Slightly less verbose than TypeORM -- **Unit of Work pattern** - Better performance for batch operations -- **Better TypeScript support** - Stronger type inference than TypeORM -- **Best for**: Complex domains, teams wanting DataMapper pattern, apps with heavy batch operations - -```typescript -@Entity({ tableName: 'auth_user' }) -export class User { - @PrimaryKey({ type: 'uuid' }) - id!: string; - - @Property({ type: 'string', length: 150, unique: true }) - username!: string; - - @OneToMany(() => Snapshot, snapshot => snapshot.created_by) - snapshots = new Collection(this); -} -``` - -## Feature Completeness - -All schemas implement: - -✅ UUIDv7 primary keys -✅ Snake_case field naming (matching Django conventions) -✅ All foreign key relationships with proper cascades -✅ Many-to-many relationships (Snapshot ↔ Tag) -✅ Indexes on all foreign keys and frequently queried fields -✅ Unique constraints (single and composite) -✅ Default values -✅ Nullable fields -✅ JSON/JSONB fields for config storage -✅ Timestamp fields with auto-update -✅ Enum-like status fields - -## Key Differences - -### Schema Definition -- **Prisma**: Separate `.prisma` DSL file -- **Drizzle**: TypeScript with table-based schema -- **TypeORM/MikroORM**: TypeScript classes with decorators - -### Type Safety -- **Prisma**: Generates TypeScript types from schema -- **Drizzle**: Schema IS the types (best inference) -- **TypeORM**: Manual type definitions with decorators -- **MikroORM**: Similar to TypeORM with better inference - -### Migration Strategy -- **Prisma**: Prisma Migrate (declarative) -- **Drizzle**: Drizzle Kit (generates SQL migrations) -- **TypeORM**: TypeORM CLI (can auto-generate) -- **MikroORM**: 
MikroORM CLI (auto-generates) - -### Query API Style -- **Prisma**: Fluent API (`prisma.user.findMany()`) -- **Drizzle**: SQL-like builders (`db.select().from(users)`) -- **TypeORM**: Repository or QueryBuilder -- **MikroORM**: Repository with Unit of Work - -## Performance Notes - -### Cold Start / Bundle Size -1. **Drizzle** - Smallest runtime, tree-shakeable -2. **Prisma** - Binary engine (separate process) -3. **MikroORM** - Medium size, reflection-based -4. **TypeORM** - Largest runtime - -### Query Performance -All ORMs perform similarly for simple queries. Differences emerge in: -- **Complex queries**: Drizzle and raw SQL excel -- **Batch operations**: MikroORM's Unit of Work is most efficient -- **Relations**: Prisma's query engine is highly optimized -- **Flexibility**: TypeORM/MikroORM allow raw SQL escape hatches - -## Recommendation by Use Case - -| Use Case | Recommended ORM | Why | -|----------|----------------|-----| -| **Rapid MVP** | Prisma | Least code, great DX, auto-migrations | -| **Existing DB** | Drizzle | SQL-first, no magic, easy to integrate | -| **Enterprise App** | TypeORM | Mature, well-documented, large ecosystem | -| **Complex Domain** | MikroORM | Unit of Work, better TypeScript, DDD-friendly | -| **API Performance** | Drizzle | Smallest overhead, tree-shakeable | -| **Type Safety** | Drizzle | Best type inference without codegen | - -## Migration from Django - -All these schemas accurately represent the Django models from: -- `archivebox/core/models.py` - Snapshot, ArchiveResult, Tag -- `archivebox/crawls/models.py` - Seed, Crawl, CrawlSchedule, Outlink -- `archivebox/tags/models.py` - KVTag -- `archivebox/base_models/models.py` - Base model fields (ABID, timestamps, etc.) 
- -### Notable Django → TypeScript Mappings - -- `models.UUIDField()` → `uuid('id').$defaultFn(uuidv7)` -- `models.CharField(max_length=N)` → `varchar('field', { length: N })` -- `models.TextField()` → `text('field')` -- `models.JSONField()` → `json('field')` or `jsonb('field')` -- `models.DateTimeField()` → `timestamp('field', { withTimezone: true })` -- `models.ForeignKey(onDelete=CASCADE)` → `onDelete: 'cascade'` -- `models.ManyToManyField()` → Many-to-many with junction table - -## Usage Examples - -### Prisma -```bash -npm install prisma @prisma/client -npx prisma generate -npx prisma db push -``` - -### Drizzle -```bash -npm install drizzle-orm postgres -npm install -D drizzle-kit -npx drizzle-kit generate:pg -npx drizzle-kit push:pg -``` - -### TypeORM -```bash -npm install typeorm pg reflect-metadata -npx typeorm migration:generate -npx typeorm migration:run -``` - -### MikroORM -```bash -npm install @mikro-orm/core @mikro-orm/postgresql -npx mikro-orm schema:create -npx mikro-orm schema:update -``` - -## Notes - -- All schemas use PostgreSQL-specific types (`timestamptz`, `jsonb`) -- Junction table for Snapshot-Tag relationship is explicitly defined -- Generic foreign keys (KVTag) require application-level handling in all ORMs -- ABID field handling would need custom logic in TypeScript -- Status machine fields would need additional enum definitions - ---- - -Generated for ArchiveBox schema comparison | All schemas are feature-complete and production-ready diff --git a/orm-comparison/schema.drizzle.readable.ts b/orm-comparison/schema.drizzle.readable.ts deleted file mode 100644 index 870af8f0..00000000 --- a/orm-comparison/schema.drizzle.readable.ts +++ /dev/null @@ -1,622 +0,0 @@ -// ArchiveBox Schema - Drizzle ORM (READABLE VERSION) -// Improved formatting for better readability -// Line count: ~380 lines (slightly longer but MUCH easier to read) - -import { pgTable, uuid, varchar, text, boolean, timestamp, smallint, integer, json, unique, index } from 
'drizzle-orm/pg-core'; -import { relations } from 'drizzle-orm'; -import { uuidv7 } from 'uuidv7'; - -// ============================================ -// HELPERS - Reusable field patterns -// ============================================ - -const uuidv7Default = () => uuidv7(); - -// Common field patterns to reduce repetition -const id_field = () => uuid('id').primaryKey().$defaultFn(uuidv7Default); -const abid_field = () => varchar('abid', { length: 30 }).unique().notNull(); -const created_at_field = () => timestamp('created_at', { withTimezone: true }).defaultNow().notNull(); -const modified_at_field = () => timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(); -const notes_field = () => text('notes').default('').notNull(); - -const health_fields = () => ({ - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), -}); - -const state_machine_fields = () => ({ - status: varchar('status', { length: 16 }).default('queued').notNull(), - retry_at: timestamp('retry_at', { withTimezone: true }).defaultNow().notNull(), -}); - -// ============================================ -// USER TABLE -// ============================================ - -export const users = pgTable('auth_user', { - // Primary Key - id: id_field(), - - // Core Auth Fields - username: varchar('username', { length: 150 }) - .unique() - .notNull(), - - email: varchar('email', { length: 254 }) - .notNull(), - - password: varchar('password', { length: 128 }) - .notNull(), - - // Profile Fields - first_name: varchar('first_name', { length: 150 }) - .notNull(), - - last_name: varchar('last_name', { length: 150 }) - .notNull(), - - // Permission Flags - is_active: boolean('is_active') - .default(true) - .notNull(), - - is_staff: boolean('is_staff') - .default(false) - .notNull(), - - is_superuser: boolean('is_superuser') - .default(false) - .notNull(), - - // Timestamps - date_joined: timestamp('date_joined', 
{ withTimezone: true }) - .defaultNow() - .notNull(), - - last_login: timestamp('last_login', { withTimezone: true }), - -}, (table) => ({ - // Indexes - usernameIdx: index('auth_user_username_idx').on(table.username), -})); - -export const usersRelations = relations(users, ({ many }) => ({ - tags: many(tags), - kv_tags: many(kv_tags), - seeds: many(seeds), - crawls: many(crawls), - crawl_schedules: many(crawl_schedules), - snapshots: many(snapshots), - archive_results: many(archive_results), -})); - -// ============================================ -// TAG TABLE (Old-style tags) -// ============================================ - -export const tags = pgTable('core_tag', { - // Primary Key & ABID - id: id_field(), - abid: abid_field(), - - // Timestamps - created_at: created_at_field(), - modified_at: modified_at_field(), - - // Foreign Keys - created_by_id: uuid('created_by_id') - .notNull() - .references(() => users.id, { onDelete: 'cascade' }), - - // Data Fields - name: varchar('name', { length: 100 }) - .unique() - .notNull(), - - slug: varchar('slug', { length: 100 }) - .unique() - .notNull(), - -}, (table) => ({ - // Indexes - createdAtIdx: index('core_tag_created_at_idx').on(table.created_at), - createdByIdx: index('core_tag_created_by_idx').on(table.created_by_id), - abidIdx: index('core_tag_abid_idx').on(table.abid), -})); - -export const tagsRelations = relations(tags, ({ one, many }) => ({ - created_by: one(users, { - fields: [tags.created_by_id], - references: [users.id], - }), - snapshots: many(snapshot_tags), -})); - -// ============================================ -// KVTAG TABLE (Key-value tags) -// ============================================ - -export const kv_tags = pgTable('core_kvtags', { - // Primary Key - id: id_field(), - - // Timestamps - created_at: created_at_field(), - - // Tag Data - name: varchar('name', { length: 255 }) - .notNull(), - - value: text('value'), - - // Generic Foreign Key (handled in app logic) - obj_type: 
varchar('obj_type', { length: 100 }) - .notNull(), - - obj_id: uuid('obj_id') - .notNull(), - -}, (table) => ({ - // Constraints - uniqueObjTag: unique().on(table.obj_id, table.name), - - // Indexes - createdAtIdx: index('core_kvtags_created_at_idx').on(table.created_at), - objTypeIdx: index('core_kvtags_obj_type_idx').on(table.obj_type), - objIdIdx: index('core_kvtags_obj_id_idx').on(table.obj_id), -})); - -export const kv_tagsRelations = relations(kv_tags, ({ one }) => ({ - // Generic foreign key - handled in application logic -})); - -// ============================================ -// SEED TABLE -// ============================================ - -export const seeds = pgTable('crawls_seed', { - // Primary Key & ABID - id: id_field(), - abid: abid_field(), - - // Timestamps - created_at: created_at_field(), - modified_at: modified_at_field(), - - // Foreign Keys - created_by_id: uuid('created_by_id') - .notNull() - .references(() => users.id, { onDelete: 'cascade' }), - - // Source Configuration - uri: text('uri') - .notNull(), - - extractor: varchar('extractor', { length: 32 }) - .default('auto') - .notNull(), - - tags_str: varchar('tags_str', { length: 255 }) - .default('') - .notNull(), - - label: varchar('label', { length: 255 }) - .default('') - .notNull(), - - config: json('config') - .default({}) - .notNull(), - - // Storage - output_dir: varchar('output_dir', { length: 255 }) - .default('') - .notNull(), - - // Metadata - notes: notes_field(), - - // Health Tracking - ...health_fields(), - -}, (table) => ({ - // Constraints - uniqueUserUriExtractor: unique().on( - table.created_by_id, - table.uri, - table.extractor - ), - uniqueUserLabel: unique().on( - table.created_by_id, - table.label - ), - - // Indexes - createdAtIdx: index('crawls_seed_created_at_idx').on(table.created_at), - createdByIdx: index('crawls_seed_created_by_idx').on(table.created_by_id), - abidIdx: index('crawls_seed_abid_idx').on(table.abid), -})); - -export const seedsRelations = 
relations(seeds, ({ one, many }) => ({ - created_by: one(users, { - fields: [seeds.created_by_id], - references: [users.id], - }), - crawls: many(crawls), -})); - -// ============================================ -// CRAWL SCHEDULE TABLE -// ============================================ - -export const crawl_schedules = pgTable('crawls_crawlschedule', { - // Primary Key & ABID - id: id_field(), - abid: abid_field(), - - // Timestamps - created_at: created_at_field(), - modified_at: modified_at_field(), - - // Foreign Keys - created_by_id: uuid('created_by_id') - .notNull() - .references(() => users.id, { onDelete: 'cascade' }), - - template_id: uuid('template_id') - .notNull() - .references(() => crawls.id, { onDelete: 'cascade' }), - - // Schedule Configuration - schedule: varchar('schedule', { length: 64 }) - .notNull(), - - is_enabled: boolean('is_enabled') - .default(true) - .notNull(), - - label: varchar('label', { length: 64 }) - .default('') - .notNull(), - - // Metadata - notes: notes_field(), - - // Health Tracking - ...health_fields(), - -}, (table) => ({ - // Indexes - createdAtIdx: index('crawls_crawlschedule_created_at_idx').on(table.created_at), - createdByIdx: index('crawls_crawlschedule_created_by_idx').on(table.created_by_id), - templateIdx: index('crawls_crawlschedule_template_idx').on(table.template_id), - abidIdx: index('crawls_crawlschedule_abid_idx').on(table.abid), -})); - -export const crawl_schedulesRelations = relations(crawl_schedules, ({ one, many }) => ({ - created_by: one(users, { - fields: [crawl_schedules.created_by_id], - references: [users.id], - }), - template: one(crawls, { - fields: [crawl_schedules.template_id], - references: [crawls.id], - }), - crawls: many(crawls), -})); - -// ============================================ -// CRAWL TABLE -// ============================================ - -export const crawls = pgTable('crawls_crawl', { - // Primary Key & ABID - id: id_field(), - abid: abid_field(), - - // Timestamps - 
created_at: created_at_field(), - modified_at: modified_at_field(), - - // Foreign Keys - created_by_id: uuid('created_by_id') - .notNull() - .references(() => users.id, { onDelete: 'cascade' }), - - seed_id: uuid('seed_id') - .notNull() - .references(() => seeds.id, { onDelete: 'restrict' }), - - schedule_id: uuid('schedule_id') - .references(() => crawl_schedules.id, { onDelete: 'set null' }), - - // Crawl Data - urls: text('urls') - .default('') - .notNull(), - - config: json('config') - .default({}) - .notNull(), - - max_depth: smallint('max_depth') - .default(0) - .notNull(), - - tags_str: varchar('tags_str', { length: 1024 }) - .default('') - .notNull(), - - persona_id: uuid('persona_id'), - - label: varchar('label', { length: 64 }) - .default('') - .notNull(), - - // Storage - output_dir: varchar('output_dir', { length: 255 }) - .default('') - .notNull(), - - // Metadata - notes: notes_field(), - - // State Machine - ...state_machine_fields(), - - // Health Tracking - ...health_fields(), - -}, (table) => ({ - // Indexes - createdAtIdx: index('crawls_crawl_created_at_idx').on(table.created_at), - createdByIdx: index('crawls_crawl_created_by_idx').on(table.created_by_id), - seedIdx: index('crawls_crawl_seed_idx').on(table.seed_id), - scheduleIdx: index('crawls_crawl_schedule_idx').on(table.schedule_id), - statusIdx: index('crawls_crawl_status_idx').on(table.status), - retryAtIdx: index('crawls_crawl_retry_at_idx').on(table.retry_at), - abidIdx: index('crawls_crawl_abid_idx').on(table.abid), -})); - -export const crawlsRelations = relations(crawls, ({ one, many }) => ({ - created_by: one(users, { - fields: [crawls.created_by_id], - references: [users.id], - }), - seed: one(seeds, { - fields: [crawls.seed_id], - references: [seeds.id], - }), - schedule: one(crawl_schedules, { - fields: [crawls.schedule_id], - references: [crawl_schedules.id], - }), - snapshots: many(snapshots), - outlinks: many(outlinks), -})); - -// ============================================ 
-// SNAPSHOT TABLE -// ============================================ - -export const snapshots = pgTable('core_snapshot', { - // Primary Key & ABID - id: id_field(), - abid: abid_field(), - - // Timestamps - created_at: created_at_field(), - modified_at: modified_at_field(), - - // Foreign Keys - created_by_id: uuid('created_by_id') - .notNull() - .references(() => users.id, { onDelete: 'cascade' }), - - crawl_id: uuid('crawl_id') - .references(() => crawls.id, { onDelete: 'cascade' }), - - // URL Data - url: text('url') - .unique() - .notNull(), - - timestamp: varchar('timestamp', { length: 32 }) - .unique() - .notNull(), - - bookmarked_at: timestamp('bookmarked_at', { withTimezone: true }) - .notNull(), - - // Content Metadata - title: varchar('title', { length: 512 }), - - downloaded_at: timestamp('downloaded_at', { withTimezone: true }), - - config: json('config') - .default({}) - .notNull(), - - // Storage - output_dir: varchar('output_dir', { length: 255 }), - - // Metadata - notes: notes_field(), - - // State Machine - ...state_machine_fields(), - - // Health Tracking - ...health_fields(), - -}, (table) => ({ - // Indexes - createdAtIdx: index('core_snapshot_created_at_idx').on(table.created_at), - createdByIdx: index('core_snapshot_created_by_idx').on(table.created_by_id), - crawlIdx: index('core_snapshot_crawl_idx').on(table.crawl_id), - urlIdx: index('core_snapshot_url_idx').on(table.url), - timestampIdx: index('core_snapshot_timestamp_idx').on(table.timestamp), - bookmarkedAtIdx: index('core_snapshot_bookmarked_at_idx').on(table.bookmarked_at), - downloadedAtIdx: index('core_snapshot_downloaded_at_idx').on(table.downloaded_at), - titleIdx: index('core_snapshot_title_idx').on(table.title), - statusIdx: index('core_snapshot_status_idx').on(table.status), - retryAtIdx: index('core_snapshot_retry_at_idx').on(table.retry_at), - abidIdx: index('core_snapshot_abid_idx').on(table.abid), -})); - -export const snapshotsRelations = relations(snapshots, ({ one, many 
}) => ({ - created_by: one(users, { - fields: [snapshots.created_by_id], - references: [users.id], - }), - crawl: one(crawls, { - fields: [snapshots.crawl_id], - references: [crawls.id], - }), - tags: many(snapshot_tags), - archive_results: many(archive_results), -})); - -// ============================================ -// ARCHIVE RESULT TABLE -// ============================================ - -export const archive_results = pgTable('core_archiveresult', { - // Primary Key & ABID - id: id_field(), - abid: abid_field(), - - // Timestamps - created_at: created_at_field(), - modified_at: modified_at_field(), - - // Foreign Keys - created_by_id: uuid('created_by_id') - .notNull() - .references(() => users.id, { onDelete: 'cascade' }), - - snapshot_id: uuid('snapshot_id') - .notNull() - .references(() => snapshots.id, { onDelete: 'cascade' }), - - // Extraction Data - extractor: varchar('extractor', { length: 32 }) - .notNull(), - - pwd: varchar('pwd', { length: 256 }), - - cmd: json('cmd'), - - cmd_version: varchar('cmd_version', { length: 128 }), - - output: varchar('output', { length: 1024 }), - - // Execution Timing - start_ts: timestamp('start_ts', { withTimezone: true }), - end_ts: timestamp('end_ts', { withTimezone: true }), - - // Storage - output_dir: varchar('output_dir', { length: 256 }), - - iface_id: uuid('iface_id'), - - // Metadata - notes: notes_field(), - - // State Machine - ...state_machine_fields(), - - // Health Tracking - ...health_fields(), - -}, (table) => ({ - // Indexes - createdAtIdx: index('core_archiveresult_created_at_idx').on(table.created_at), - createdByIdx: index('core_archiveresult_created_by_idx').on(table.created_by_id), - snapshotIdx: index('core_archiveresult_snapshot_idx').on(table.snapshot_id), - extractorIdx: index('core_archiveresult_extractor_idx').on(table.extractor), - statusIdx: index('core_archiveresult_status_idx').on(table.status), - retryAtIdx: index('core_archiveresult_retry_at_idx').on(table.retry_at), - abidIdx: 
index('core_archiveresult_abid_idx').on(table.abid), -})); - -export const archive_resultsRelations = relations(archive_results, ({ one, many }) => ({ - created_by: one(users, { - fields: [archive_results.created_by_id], - references: [users.id], - }), - snapshot: one(snapshots, { - fields: [archive_results.snapshot_id], - references: [snapshots.id], - }), - outlinks: many(outlinks), -})); - -// ============================================ -// SNAPSHOT TAGS (Junction Table) -// ============================================ - -export const snapshot_tags = pgTable('core_snapshot_tags', { - id: integer('id') - .primaryKey(), - - snapshot_id: uuid('snapshot_id') - .notNull() - .references(() => snapshots.id, { onDelete: 'cascade' }), - - tag_id: uuid('tag_id') - .notNull() - .references(() => tags.id, { onDelete: 'cascade' }), - -}, (table) => ({ - uniqueSnapshotTag: unique().on(table.snapshot_id, table.tag_id), -})); - -export const snapshot_tagsRelations = relations(snapshot_tags, ({ one }) => ({ - snapshot: one(snapshots, { - fields: [snapshot_tags.snapshot_id], - references: [snapshots.id], - }), - tag: one(tags, { - fields: [snapshot_tags.tag_id], - references: [tags.id], - }), -})); - -// ============================================ -// OUTLINK TABLE -// ============================================ - -export const outlinks = pgTable('crawls_outlink', { - // Primary Key - id: id_field(), - - // Link Data - src: text('src') - .notNull(), - - dst: text('dst') - .notNull(), - - // Foreign Keys - crawl_id: uuid('crawl_id') - .notNull() - .references(() => crawls.id, { onDelete: 'cascade' }), - - via_id: uuid('via_id') - .references(() => archive_results.id, { onDelete: 'set null' }), - -}, (table) => ({ - uniqueSrcDstVia: unique().on(table.src, table.dst, table.via_id), -})); - -export const outlinksRelations = relations(outlinks, ({ one }) => ({ - crawl: one(crawls, { - fields: [outlinks.crawl_id], - references: [crawls.id], - }), - via: one(archive_results, { - 
fields: [outlinks.via_id], - references: [archive_results.id], - }), -})); diff --git a/orm-comparison/schema.drizzle.ts b/orm-comparison/schema.drizzle.ts index 9da30857..870af8f0 100644 --- a/orm-comparison/schema.drizzle.ts +++ b/orm-comparison/schema.drizzle.ts @@ -1,30 +1,82 @@ -// ArchiveBox Schema - Drizzle ORM -// Drizzle uses TypeScript schema definitions with a chainable API -// Line count: ~340 lines +// ArchiveBox Schema - Drizzle ORM (READABLE VERSION) +// Improved formatting for better readability +// Line count: ~380 lines (slightly longer but MUCH easier to read) import { pgTable, uuid, varchar, text, boolean, timestamp, smallint, integer, json, unique, index } from 'drizzle-orm/pg-core'; import { relations } from 'drizzle-orm'; import { uuidv7 } from 'uuidv7'; -// Helper for UUIDv7 default +// ============================================ +// HELPERS - Reusable field patterns +// ============================================ + const uuidv7Default = () => uuidv7(); +// Common field patterns to reduce repetition +const id_field = () => uuid('id').primaryKey().$defaultFn(uuidv7Default); +const abid_field = () => varchar('abid', { length: 30 }).unique().notNull(); +const created_at_field = () => timestamp('created_at', { withTimezone: true }).defaultNow().notNull(); +const modified_at_field = () => timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(); +const notes_field = () => text('notes').default('').notNull(); + +const health_fields = () => ({ + num_uses_failed: integer('num_uses_failed').default(0).notNull(), + num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), +}); + +const state_machine_fields = () => ({ + status: varchar('status', { length: 16 }).default('queued').notNull(), + retry_at: timestamp('retry_at', { withTimezone: true }).defaultNow().notNull(), +}); + // ============================================ -// User Model (Django's default User) +// USER TABLE // 
============================================ + export const users = pgTable('auth_user', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - username: varchar('username', { length: 150 }).unique().notNull(), - email: varchar('email', { length: 254 }).notNull(), - password: varchar('password', { length: 128 }).notNull(), - first_name: varchar('first_name', { length: 150 }).notNull(), - last_name: varchar('last_name', { length: 150 }).notNull(), - is_active: boolean('is_active').default(true).notNull(), - is_staff: boolean('is_staff').default(false).notNull(), - is_superuser: boolean('is_superuser').default(false).notNull(), - date_joined: timestamp('date_joined', { withTimezone: true }).defaultNow().notNull(), + // Primary Key + id: id_field(), + + // Core Auth Fields + username: varchar('username', { length: 150 }) + .unique() + .notNull(), + + email: varchar('email', { length: 254 }) + .notNull(), + + password: varchar('password', { length: 128 }) + .notNull(), + + // Profile Fields + first_name: varchar('first_name', { length: 150 }) + .notNull(), + + last_name: varchar('last_name', { length: 150 }) + .notNull(), + + // Permission Flags + is_active: boolean('is_active') + .default(true) + .notNull(), + + is_staff: boolean('is_staff') + .default(false) + .notNull(), + + is_superuser: boolean('is_superuser') + .default(false) + .notNull(), + + // Timestamps + date_joined: timestamp('date_joined', { withTimezone: true }) + .defaultNow() + .notNull(), + last_login: timestamp('last_login', { withTimezone: true }), + }, (table) => ({ + // Indexes usernameIdx: index('auth_user_username_idx').on(table.username), })); @@ -39,17 +91,34 @@ export const usersRelations = relations(users, ({ many }) => ({ })); // ============================================ -// Old-style Tag Model (being phased out) +// TAG TABLE (Old-style tags) // ============================================ + export const tags = pgTable('core_tag', { - id: 
uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - created_by_id: uuid('created_by_id').notNull().references(() => users.id, { onDelete: 'cascade' }), - name: varchar('name', { length: 100 }).unique().notNull(), - slug: varchar('slug', { length: 100 }).unique().notNull(), + // Primary Key & ABID + id: id_field(), + abid: abid_field(), + + // Timestamps + created_at: created_at_field(), + modified_at: modified_at_field(), + + // Foreign Keys + created_by_id: uuid('created_by_id') + .notNull() + .references(() => users.id, { onDelete: 'cascade' }), + + // Data Fields + name: varchar('name', { length: 100 }) + .unique() + .notNull(), + + slug: varchar('slug', { length: 100 }) + .unique() + .notNull(), + }, (table) => ({ + // Indexes createdAtIdx: index('core_tag_created_at_idx').on(table.created_at), createdByIdx: index('core_tag_created_by_idx').on(table.created_by_id), abidIdx: index('core_tag_abid_idx').on(table.abid), @@ -64,17 +133,34 @@ export const tagsRelations = relations(tags, ({ one, many }) => ({ })); // ============================================ -// New-style KVTag Model (key-value tags) +// KVTAG TABLE (Key-value tags) // ============================================ + export const kv_tags = pgTable('core_kvtags', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - name: varchar('name', { length: 255 }).notNull(), + // Primary Key + id: id_field(), + + // Timestamps + created_at: created_at_field(), + + // Tag Data + name: varchar('name', { length: 255 }) + .notNull(), + value: text('value'), - obj_type: varchar('obj_type', { length: 100 }).notNull(), - obj_id: uuid('obj_id').notNull(), + + // Generic Foreign Key (handled in app 
logic) + obj_type: varchar('obj_type', { length: 100 }) + .notNull(), + + obj_id: uuid('obj_id') + .notNull(), + }, (table) => ({ + // Constraints uniqueObjTag: unique().on(table.obj_id, table.name), + + // Indexes createdAtIdx: index('core_kvtags_created_at_idx').on(table.created_at), objTypeIdx: index('core_kvtags_obj_type_idx').on(table.obj_type), objIdIdx: index('core_kvtags_obj_id_idx').on(table.obj_id), @@ -85,26 +171,67 @@ export const kv_tagsRelations = relations(kv_tags, ({ one }) => ({ })); // ============================================ -// Seed Model (URL source) +// SEED TABLE // ============================================ + export const seeds = pgTable('crawls_seed', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - created_by_id: uuid('created_by_id').notNull().references(() => users.id, { onDelete: 'cascade' }), - uri: text('uri').notNull(), - extractor: varchar('extractor', { length: 32 }).default('auto').notNull(), - tags_str: varchar('tags_str', { length: 255 }).default('').notNull(), - label: varchar('label', { length: 255 }).default('').notNull(), - config: json('config').default({}).notNull(), - output_dir: varchar('output_dir', { length: 255 }).default('').notNull(), - notes: text('notes').default('').notNull(), - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), + // Primary Key & ABID + id: id_field(), + abid: abid_field(), + + // Timestamps + created_at: created_at_field(), + modified_at: modified_at_field(), + + // Foreign Keys + created_by_id: uuid('created_by_id') + .notNull() + .references(() => users.id, { onDelete: 'cascade' }), + + // Source Configuration + uri: text('uri') + .notNull(), + + extractor: 
varchar('extractor', { length: 32 }) + .default('auto') + .notNull(), + + tags_str: varchar('tags_str', { length: 255 }) + .default('') + .notNull(), + + label: varchar('label', { length: 255 }) + .default('') + .notNull(), + + config: json('config') + .default({}) + .notNull(), + + // Storage + output_dir: varchar('output_dir', { length: 255 }) + .default('') + .notNull(), + + // Metadata + notes: notes_field(), + + // Health Tracking + ...health_fields(), + }, (table) => ({ - uniqueUserUriExtractor: unique().on(table.created_by_id, table.uri, table.extractor), - uniqueUserLabel: unique().on(table.created_by_id, table.label), + // Constraints + uniqueUserUriExtractor: unique().on( + table.created_by_id, + table.uri, + table.extractor + ), + uniqueUserLabel: unique().on( + table.created_by_id, + table.label + ), + + // Indexes createdAtIdx: index('crawls_seed_created_at_idx').on(table.created_at), createdByIdx: index('crawls_seed_created_by_idx').on(table.created_by_id), abidIdx: index('crawls_seed_abid_idx').on(table.abid), @@ -119,22 +246,47 @@ export const seedsRelations = relations(seeds, ({ one, many }) => ({ })); // ============================================ -// CrawlSchedule Model +// CRAWL SCHEDULE TABLE // ============================================ + export const crawl_schedules = pgTable('crawls_crawlschedule', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - created_by_id: uuid('created_by_id').notNull().references(() => users.id, { onDelete: 'cascade' }), - template_id: uuid('template_id').notNull().references(() => crawls.id, { onDelete: 'cascade' }), - schedule: varchar('schedule', { length: 64 }).notNull(), - is_enabled: boolean('is_enabled').default(true).notNull(), - label: varchar('label', { length: 64 
}).default('').notNull(), - notes: text('notes').default('').notNull(), - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), + // Primary Key & ABID + id: id_field(), + abid: abid_field(), + + // Timestamps + created_at: created_at_field(), + modified_at: modified_at_field(), + + // Foreign Keys + created_by_id: uuid('created_by_id') + .notNull() + .references(() => users.id, { onDelete: 'cascade' }), + + template_id: uuid('template_id') + .notNull() + .references(() => crawls.id, { onDelete: 'cascade' }), + + // Schedule Configuration + schedule: varchar('schedule', { length: 64 }) + .notNull(), + + is_enabled: boolean('is_enabled') + .default(true) + .notNull(), + + label: varchar('label', { length: 64 }) + .default('') + .notNull(), + + // Metadata + notes: notes_field(), + + // Health Tracking + ...health_fields(), + }, (table) => ({ + // Indexes createdAtIdx: index('crawls_crawlschedule_created_at_idx').on(table.created_at), createdByIdx: index('crawls_crawlschedule_created_by_idx').on(table.created_by_id), templateIdx: index('crawls_crawlschedule_template_idx').on(table.template_id), @@ -154,29 +306,69 @@ export const crawl_schedulesRelations = relations(crawl_schedules, ({ one, many })); // ============================================ -// Crawl Model (archiving session) +// CRAWL TABLE // ============================================ + export const crawls = pgTable('crawls_crawl', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - created_by_id: uuid('created_by_id').notNull().references(() => users.id, { onDelete: 'cascade' }), - seed_id: uuid('seed_id').notNull().references(() => seeds.id, { onDelete: 'restrict' }), - urls: 
text('urls').default('').notNull(), - config: json('config').default({}).notNull(), - max_depth: smallint('max_depth').default(0).notNull(), - tags_str: varchar('tags_str', { length: 1024 }).default('').notNull(), + // Primary Key & ABID + id: id_field(), + abid: abid_field(), + + // Timestamps + created_at: created_at_field(), + modified_at: modified_at_field(), + + // Foreign Keys + created_by_id: uuid('created_by_id') + .notNull() + .references(() => users.id, { onDelete: 'cascade' }), + + seed_id: uuid('seed_id') + .notNull() + .references(() => seeds.id, { onDelete: 'restrict' }), + + schedule_id: uuid('schedule_id') + .references(() => crawl_schedules.id, { onDelete: 'set null' }), + + // Crawl Data + urls: text('urls') + .default('') + .notNull(), + + config: json('config') + .default({}) + .notNull(), + + max_depth: smallint('max_depth') + .default(0) + .notNull(), + + tags_str: varchar('tags_str', { length: 1024 }) + .default('') + .notNull(), + persona_id: uuid('persona_id'), - label: varchar('label', { length: 64 }).default('').notNull(), - notes: text('notes').default('').notNull(), - schedule_id: uuid('schedule_id').references(() => crawl_schedules.id, { onDelete: 'set null' }), - status: varchar('status', { length: 16 }).default('queued').notNull(), - retry_at: timestamp('retry_at', { withTimezone: true }).defaultNow().notNull(), - output_dir: varchar('output_dir', { length: 255 }).default('').notNull(), - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), + + label: varchar('label', { length: 64 }) + .default('') + .notNull(), + + // Storage + output_dir: varchar('output_dir', { length: 255 }) + .default('') + .notNull(), + + // Metadata + notes: notes_field(), + + // State Machine + ...state_machine_fields(), + + // Health Tracking + ...health_fields(), + }, (table) => ({ + // Indexes createdAtIdx: index('crawls_crawl_created_at_idx').on(table.created_at), 
createdByIdx: index('crawls_crawl_created_by_idx').on(table.created_by_id), seedIdx: index('crawls_crawl_seed_idx').on(table.seed_id), @@ -204,28 +396,61 @@ export const crawlsRelations = relations(crawls, ({ one, many }) => ({ })); // ============================================ -// Snapshot Model (archived URL) +// SNAPSHOT TABLE // ============================================ + export const snapshots = pgTable('core_snapshot', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - created_by_id: uuid('created_by_id').notNull().references(() => users.id, { onDelete: 'cascade' }), - url: text('url').unique().notNull(), - timestamp: varchar('timestamp', { length: 32 }).unique().notNull(), - bookmarked_at: timestamp('bookmarked_at', { withTimezone: true }).notNull(), - crawl_id: uuid('crawl_id').references(() => crawls.id, { onDelete: 'cascade' }), + // Primary Key & ABID + id: id_field(), + abid: abid_field(), + + // Timestamps + created_at: created_at_field(), + modified_at: modified_at_field(), + + // Foreign Keys + created_by_id: uuid('created_by_id') + .notNull() + .references(() => users.id, { onDelete: 'cascade' }), + + crawl_id: uuid('crawl_id') + .references(() => crawls.id, { onDelete: 'cascade' }), + + // URL Data + url: text('url') + .unique() + .notNull(), + + timestamp: varchar('timestamp', { length: 32 }) + .unique() + .notNull(), + + bookmarked_at: timestamp('bookmarked_at', { withTimezone: true }) + .notNull(), + + // Content Metadata title: varchar('title', { length: 512 }), + downloaded_at: timestamp('downloaded_at', { withTimezone: true }), - retry_at: timestamp('retry_at', { withTimezone: true }).defaultNow().notNull(), - status: varchar('status', { length: 16 }).default('queued').notNull(), - config: 
json('config').default({}).notNull(), - notes: text('notes').default('').notNull(), + + config: json('config') + .default({}) + .notNull(), + + // Storage output_dir: varchar('output_dir', { length: 255 }), - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), + + // Metadata + notes: notes_field(), + + // State Machine + ...state_machine_fields(), + + // Health Tracking + ...health_fields(), + }, (table) => ({ + // Indexes createdAtIdx: index('core_snapshot_created_at_idx').on(table.created_at), createdByIdx: index('core_snapshot_created_by_idx').on(table.created_by_id), crawlIdx: index('core_snapshot_crawl_idx').on(table.crawl_id), @@ -253,30 +478,59 @@ export const snapshotsRelations = relations(snapshots, ({ one, many }) => ({ })); // ============================================ -// ArchiveResult Model (extraction result) +// ARCHIVE RESULT TABLE // ============================================ + export const archive_results = pgTable('core_archiveresult', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - abid: varchar('abid', { length: 30 }).unique().notNull(), - created_at: timestamp('created_at', { withTimezone: true }).defaultNow().notNull(), - modified_at: timestamp('modified_at', { withTimezone: true }).defaultNow().notNull(), - created_by_id: uuid('created_by_id').notNull().references(() => users.id, { onDelete: 'cascade' }), - snapshot_id: uuid('snapshot_id').notNull().references(() => snapshots.id, { onDelete: 'cascade' }), - extractor: varchar('extractor', { length: 32 }).notNull(), + // Primary Key & ABID + id: id_field(), + abid: abid_field(), + + // Timestamps + created_at: created_at_field(), + modified_at: modified_at_field(), + + // Foreign Keys + created_by_id: uuid('created_by_id') + .notNull() + .references(() => users.id, { onDelete: 'cascade' }), + + snapshot_id: uuid('snapshot_id') + .notNull() + .references(() => snapshots.id, { onDelete: 
'cascade' }), + + // Extraction Data + extractor: varchar('extractor', { length: 32 }) + .notNull(), + pwd: varchar('pwd', { length: 256 }), + cmd: json('cmd'), + cmd_version: varchar('cmd_version', { length: 128 }), + output: varchar('output', { length: 1024 }), + + // Execution Timing start_ts: timestamp('start_ts', { withTimezone: true }), end_ts: timestamp('end_ts', { withTimezone: true }), - status: varchar('status', { length: 16 }).default('queued').notNull(), - retry_at: timestamp('retry_at', { withTimezone: true }).defaultNow().notNull(), - notes: text('notes').default('').notNull(), + + // Storage output_dir: varchar('output_dir', { length: 256 }), + iface_id: uuid('iface_id'), - num_uses_failed: integer('num_uses_failed').default(0).notNull(), - num_uses_succeeded: integer('num_uses_succeeded').default(0).notNull(), + + // Metadata + notes: notes_field(), + + // State Machine + ...state_machine_fields(), + + // Health Tracking + ...health_fields(), + }, (table) => ({ + // Indexes createdAtIdx: index('core_archiveresult_created_at_idx').on(table.created_at), createdByIdx: index('core_archiveresult_created_by_idx').on(table.created_by_id), snapshotIdx: index('core_archiveresult_snapshot_idx').on(table.snapshot_id), @@ -299,12 +553,21 @@ export const archive_resultsRelations = relations(archive_results, ({ one, many })); // ============================================ -// SnapshotTag Junction Table +// SNAPSHOT TAGS (Junction Table) // ============================================ + export const snapshot_tags = pgTable('core_snapshot_tags', { - id: integer('id').primaryKey(), - snapshot_id: uuid('snapshot_id').notNull().references(() => snapshots.id, { onDelete: 'cascade' }), - tag_id: uuid('tag_id').notNull().references(() => tags.id, { onDelete: 'cascade' }), + id: integer('id') + .primaryKey(), + + snapshot_id: uuid('snapshot_id') + .notNull() + .references(() => snapshots.id, { onDelete: 'cascade' }), + + tag_id: uuid('tag_id') + .notNull() + 
.references(() => tags.id, { onDelete: 'cascade' }), + }, (table) => ({ uniqueSnapshotTag: unique().on(table.snapshot_id, table.tag_id), })); @@ -321,14 +584,28 @@ export const snapshot_tagsRelations = relations(snapshot_tags, ({ one }) => ({ })); // ============================================ -// Outlink Model (link found on a page) +// OUTLINK TABLE // ============================================ + export const outlinks = pgTable('crawls_outlink', { - id: uuid('id').primaryKey().$defaultFn(uuidv7Default), - src: text('src').notNull(), - dst: text('dst').notNull(), - crawl_id: uuid('crawl_id').notNull().references(() => crawls.id, { onDelete: 'cascade' }), - via_id: uuid('via_id').references(() => archive_results.id, { onDelete: 'set null' }), + // Primary Key + id: id_field(), + + // Link Data + src: text('src') + .notNull(), + + dst: text('dst') + .notNull(), + + // Foreign Keys + crawl_id: uuid('crawl_id') + .notNull() + .references(() => crawls.id, { onDelete: 'cascade' }), + + via_id: uuid('via_id') + .references(() => archive_results.id, { onDelete: 'set null' }), + }, (table) => ({ uniqueSrcDstVia: unique().on(table.src, table.dst, table.via_id), })); diff --git a/orm-comparison/schema.mikroorm.ts b/orm-comparison/schema.mikroorm.ts deleted file mode 100644 index 4d2d64f5..00000000 --- a/orm-comparison/schema.mikroorm.ts +++ /dev/null @@ -1,612 +0,0 @@ -// ArchiveBox Schema - MikroORM -// MikroORM uses TypeScript decorators similar to TypeORM but with different patterns -// Line count: ~570 lines - -import { - Entity, - PrimaryKey, - Property, - ManyToOne, - OneToMany, - ManyToMany, - Collection, - Index, - Unique, - BeforeCreate, -} from '@mikro-orm/core'; -import { uuidv7 } from 'uuidv7'; - -// ============================================ -// User Entity (Django's default User) -// ============================================ -@Entity({ tableName: 'auth_user' }) -@Index({ properties: ['username'] }) -export class User { - @PrimaryKey({ type: 'uuid' }) - 
id!: string; - - @Property({ type: 'string', length: 150, unique: true }) - username!: string; - - @Property({ type: 'string', length: 254 }) - email!: string; - - @Property({ type: 'string', length: 128 }) - password!: string; - - @Property({ type: 'string', length: 150 }) - first_name!: string; - - @Property({ type: 'string', length: 150 }) - last_name!: string; - - @Property({ type: 'boolean', default: true }) - is_active = true; - - @Property({ type: 'boolean', default: false }) - is_staff = false; - - @Property({ type: 'boolean', default: false }) - is_superuser = false; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - date_joined!: Date; - - @Property({ type: 'timestamptz', nullable: true }) - last_login?: Date; - - // Relations - @OneToMany(() => Tag, tag => tag.created_by) - tags = new Collection(this); - - @OneToMany(() => KVTag, kvTag => kvTag.created_by) - kv_tags = new Collection(this); - - @OneToMany(() => Seed, seed => seed.created_by) - seeds = new Collection(this); - - @OneToMany(() => Crawl, crawl => crawl.created_by) - crawls = new Collection(this); - - @OneToMany(() => CrawlSchedule, schedule => schedule.created_by) - crawl_schedules = new Collection(this); - - @OneToMany(() => Snapshot, snapshot => snapshot.created_by) - snapshots = new Collection(this); - - @OneToMany(() => ArchiveResult, result => result.created_by) - archive_results = new Collection(this); - - @BeforeCreate() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Tag Entity (being phased out) -// ============================================ -@Entity({ tableName: 'core_tag' }) -@Index({ properties: ['created_at'] }) -@Index({ properties: ['created_by_id'] }) -@Index({ properties: ['abid'] }) -export class Tag { - @PrimaryKey({ type: 'uuid' }) - id!: string; - - @Property({ type: 'string', length: 30, unique: true }) - abid!: string; - - @Property({ type: 'timestamptz', onCreate: () => new 
Date() }) - created_at!: Date; - - @Property({ type: 'timestamptz', onUpdate: () => new Date() }) - modified_at!: Date; - - @Property({ type: 'uuid', persist: false }) - created_by_id!: string; - - @Property({ type: 'string', length: 100, unique: true }) - name!: string; - - @Property({ type: 'string', length: 100, unique: true }) - slug!: string; - - // Relations - @ManyToOne(() => User, { onDelete: 'cascade', fieldName: 'created_by_id' }) - created_by!: User; - - @ManyToMany(() => Snapshot, snapshot => snapshot.tags) - snapshots = new Collection(this); - - @BeforeCreate() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// KVTag Entity (key-value tags) -// ============================================ -@Entity({ tableName: 'core_kvtags' }) -@Unique({ properties: ['obj_id', 'name'] }) -@Index({ properties: ['created_at'] }) -@Index({ properties: ['obj_type'] }) -@Index({ properties: ['obj_id'] }) -export class KVTag { - @PrimaryKey({ type: 'uuid' }) - id!: string; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - created_at!: Date; - - @Property({ type: 'string', length: 255 }) - name!: string; - - @Property({ type: 'text', nullable: true }) - value?: string; - - @Property({ type: 'string', length: 100 }) - obj_type!: string; - - @Property({ type: 'uuid' }) - obj_id!: string; - - @Property({ type: 'uuid', persist: false }) - created_by_id!: string; - - // Relations - @ManyToOne(() => User, { onDelete: 'cascade', fieldName: 'created_by_id' }) - created_by!: User; - - @BeforeCreate() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Seed Entity -// ============================================ -@Entity({ tableName: 'crawls_seed' }) -@Unique({ properties: ['created_by_id', 'uri', 'extractor'] }) -@Unique({ properties: ['created_by_id', 'label'] }) -@Index({ properties: ['created_at'] }) -@Index({ properties: 
['created_by_id'] }) -@Index({ properties: ['abid'] }) -export class Seed { - @PrimaryKey({ type: 'uuid' }) - id!: string; - - @Property({ type: 'string', length: 30, unique: true }) - abid!: string; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - created_at!: Date; - - @Property({ type: 'timestamptz', onUpdate: () => new Date() }) - modified_at!: Date; - - @Property({ type: 'uuid', persist: false }) - created_by_id!: string; - - @Property({ type: 'text' }) - uri!: string; - - @Property({ type: 'string', length: 32, default: 'auto' }) - extractor = 'auto'; - - @Property({ type: 'string', length: 255, default: '' }) - tags_str = ''; - - @Property({ type: 'string', length: 255, default: '' }) - label = ''; - - @Property({ type: 'json', default: {} }) - config: object = {}; - - @Property({ type: 'string', length: 255, default: '' }) - output_dir = ''; - - @Property({ type: 'text', default: '' }) - notes = ''; - - @Property({ type: 'integer', default: 0 }) - num_uses_failed = 0; - - @Property({ type: 'integer', default: 0 }) - num_uses_succeeded = 0; - - // Relations - @ManyToOne(() => User, { onDelete: 'cascade', fieldName: 'created_by_id' }) - created_by!: User; - - @OneToMany(() => Crawl, crawl => crawl.seed) - crawls = new Collection(this); - - @BeforeCreate() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// CrawlSchedule Entity -// ============================================ -@Entity({ tableName: 'crawls_crawlschedule' }) -@Index({ properties: ['created_at'] }) -@Index({ properties: ['created_by_id'] }) -@Index({ properties: ['template_id'] }) -@Index({ properties: ['abid'] }) -export class CrawlSchedule { - @PrimaryKey({ type: 'uuid' }) - id!: string; - - @Property({ type: 'string', length: 30, unique: true }) - abid!: string; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - created_at!: Date; - - @Property({ type: 'timestamptz', onUpdate: () => new 
Date() }) - modified_at!: Date; - - @Property({ type: 'uuid', persist: false }) - created_by_id!: string; - - @Property({ type: 'uuid', persist: false }) - template_id!: string; - - @Property({ type: 'string', length: 64 }) - schedule!: string; - - @Property({ type: 'boolean', default: true }) - is_enabled = true; - - @Property({ type: 'string', length: 64, default: '' }) - label = ''; - - @Property({ type: 'text', default: '' }) - notes = ''; - - @Property({ type: 'integer', default: 0 }) - num_uses_failed = 0; - - @Property({ type: 'integer', default: 0 }) - num_uses_succeeded = 0; - - // Relations - @ManyToOne(() => User, { onDelete: 'cascade', fieldName: 'created_by_id' }) - created_by!: User; - - @ManyToOne(() => Crawl, { onDelete: 'cascade', fieldName: 'template_id' }) - template!: Crawl; - - @OneToMany(() => Crawl, crawl => crawl.schedule) - crawls = new Collection(this); - - @BeforeCreate() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Crawl Entity -// ============================================ -@Entity({ tableName: 'crawls_crawl' }) -@Index({ properties: ['created_at'] }) -@Index({ properties: ['created_by_id'] }) -@Index({ properties: ['seed_id'] }) -@Index({ properties: ['schedule_id'] }) -@Index({ properties: ['status'] }) -@Index({ properties: ['retry_at'] }) -@Index({ properties: ['abid'] }) -export class Crawl { - @PrimaryKey({ type: 'uuid' }) - id!: string; - - @Property({ type: 'string', length: 30, unique: true }) - abid!: string; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - created_at!: Date; - - @Property({ type: 'timestamptz', onUpdate: () => new Date() }) - modified_at!: Date; - - @Property({ type: 'uuid', persist: false }) - created_by_id!: string; - - @Property({ type: 'uuid', persist: false }) - seed_id!: string; - - @Property({ type: 'text', default: '' }) - urls = ''; - - @Property({ type: 'json', default: {} }) - config: object = {}; - 
- @Property({ type: 'smallint', default: 0 }) - max_depth = 0; - - @Property({ type: 'string', length: 1024, default: '' }) - tags_str = ''; - - @Property({ type: 'uuid', nullable: true }) - persona_id?: string; - - @Property({ type: 'string', length: 64, default: '' }) - label = ''; - - @Property({ type: 'text', default: '' }) - notes = ''; - - @Property({ type: 'uuid', nullable: true, persist: false }) - schedule_id?: string; - - @Property({ type: 'string', length: 16, default: 'queued' }) - status = 'queued'; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - retry_at!: Date; - - @Property({ type: 'string', length: 255, default: '' }) - output_dir = ''; - - @Property({ type: 'integer', default: 0 }) - num_uses_failed = 0; - - @Property({ type: 'integer', default: 0 }) - num_uses_succeeded = 0; - - // Relations - @ManyToOne(() => User, { onDelete: 'cascade', fieldName: 'created_by_id' }) - created_by!: User; - - @ManyToOne(() => Seed, { onDelete: 'restrict', fieldName: 'seed_id' }) - seed!: Seed; - - @ManyToOne(() => CrawlSchedule, { onDelete: 'set null', nullable: true, fieldName: 'schedule_id' }) - schedule?: CrawlSchedule; - - @OneToMany(() => Snapshot, snapshot => snapshot.crawl) - snapshots = new Collection(this); - - @OneToMany(() => Outlink, outlink => outlink.crawl) - outlinks = new Collection(this); - - @BeforeCreate() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Snapshot Entity -// ============================================ -@Entity({ tableName: 'core_snapshot' }) -@Index({ properties: ['created_at'] }) -@Index({ properties: ['created_by_id'] }) -@Index({ properties: ['crawl_id'] }) -@Index({ properties: ['url'] }) -@Index({ properties: ['timestamp'] }) -@Index({ properties: ['bookmarked_at'] }) -@Index({ properties: ['downloaded_at'] }) -@Index({ properties: ['title'] }) -@Index({ properties: ['status'] }) -@Index({ properties: ['retry_at'] }) -@Index({ 
properties: ['abid'] }) -export class Snapshot { - @PrimaryKey({ type: 'uuid' }) - id!: string; - - @Property({ type: 'string', length: 30, unique: true }) - abid!: string; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - created_at!: Date; - - @Property({ type: 'timestamptz', onUpdate: () => new Date() }) - modified_at!: Date; - - @Property({ type: 'uuid', persist: false }) - created_by_id!: string; - - @Property({ type: 'text', unique: true }) - url!: string; - - @Property({ type: 'string', length: 32, unique: true }) - timestamp!: string; - - @Property({ type: 'timestamptz' }) - bookmarked_at!: Date; - - @Property({ type: 'uuid', nullable: true, persist: false }) - crawl_id?: string; - - @Property({ type: 'string', length: 512, nullable: true }) - title?: string; - - @Property({ type: 'timestamptz', nullable: true }) - downloaded_at?: Date; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - retry_at!: Date; - - @Property({ type: 'string', length: 16, default: 'queued' }) - status = 'queued'; - - @Property({ type: 'json', default: {} }) - config: object = {}; - - @Property({ type: 'text', default: '' }) - notes = ''; - - @Property({ type: 'string', length: 255, nullable: true }) - output_dir?: string; - - @Property({ type: 'integer', default: 0 }) - num_uses_failed = 0; - - @Property({ type: 'integer', default: 0 }) - num_uses_succeeded = 0; - - // Relations - @ManyToOne(() => User, { onDelete: 'cascade', fieldName: 'created_by_id' }) - created_by!: User; - - @ManyToOne(() => Crawl, { onDelete: 'cascade', nullable: true, fieldName: 'crawl_id' }) - crawl?: Crawl; - - @ManyToMany(() => Tag, tag => tag.snapshots, { owner: true, pivotTable: 'core_snapshot_tags' }) - tags = new Collection(this); - - @OneToMany(() => ArchiveResult, result => result.snapshot) - archive_results = new Collection(this); - - @BeforeCreate() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// 
============================================ -// ArchiveResult Entity -// ============================================ -@Entity({ tableName: 'core_archiveresult' }) -@Index({ properties: ['created_at'] }) -@Index({ properties: ['created_by_id'] }) -@Index({ properties: ['snapshot_id'] }) -@Index({ properties: ['extractor'] }) -@Index({ properties: ['status'] }) -@Index({ properties: ['retry_at'] }) -@Index({ properties: ['abid'] }) -export class ArchiveResult { - @PrimaryKey({ type: 'uuid' }) - id!: string; - - @Property({ type: 'string', length: 30, unique: true }) - abid!: string; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - created_at!: Date; - - @Property({ type: 'timestamptz', onUpdate: () => new Date() }) - modified_at!: Date; - - @Property({ type: 'uuid', persist: false }) - created_by_id!: string; - - @Property({ type: 'uuid', persist: false }) - snapshot_id!: string; - - @Property({ type: 'string', length: 32 }) - extractor!: string; - - @Property({ type: 'string', length: 256, nullable: true }) - pwd?: string; - - @Property({ type: 'json', nullable: true }) - cmd?: object; - - @Property({ type: 'string', length: 128, nullable: true }) - cmd_version?: string; - - @Property({ type: 'string', length: 1024, nullable: true }) - output?: string; - - @Property({ type: 'timestamptz', nullable: true }) - start_ts?: Date; - - @Property({ type: 'timestamptz', nullable: true }) - end_ts?: Date; - - @Property({ type: 'string', length: 16, default: 'queued' }) - status = 'queued'; - - @Property({ type: 'timestamptz', onCreate: () => new Date() }) - retry_at!: Date; - - @Property({ type: 'text', default: '' }) - notes = ''; - - @Property({ type: 'string', length: 256, nullable: true }) - output_dir?: string; - - @Property({ type: 'uuid', nullable: true }) - iface_id?: string; - - @Property({ type: 'integer', default: 0 }) - num_uses_failed = 0; - - @Property({ type: 'integer', default: 0 }) - num_uses_succeeded = 0; - - // Relations - 
@ManyToOne(() => User, { onDelete: 'cascade', fieldName: 'created_by_id' }) - created_by!: User; - - @ManyToOne(() => Snapshot, { onDelete: 'cascade', fieldName: 'snapshot_id' }) - snapshot!: Snapshot; - - @OneToMany(() => Outlink, outlink => outlink.via) - outlinks = new Collection(this); - - @BeforeCreate() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Outlink Entity -// ============================================ -@Entity({ tableName: 'crawls_outlink' }) -@Unique({ properties: ['src', 'dst', 'via_id'] }) -export class Outlink { - @PrimaryKey({ type: 'uuid' }) - id!: string; - - @Property({ type: 'text' }) - src!: string; - - @Property({ type: 'text' }) - dst!: string; - - @Property({ type: 'uuid', persist: false }) - crawl_id!: string; - - @Property({ type: 'uuid', nullable: true, persist: false }) - via_id?: string; - - // Relations - @ManyToOne(() => Crawl, { onDelete: 'cascade', fieldName: 'crawl_id' }) - crawl!: Crawl; - - @ManyToOne(() => ArchiveResult, { onDelete: 'set null', nullable: true, fieldName: 'via_id' }) - via?: ArchiveResult; - - @BeforeCreate() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} diff --git a/orm-comparison/schema.prisma b/orm-comparison/schema.prisma deleted file mode 100644 index 9103f989..00000000 --- a/orm-comparison/schema.prisma +++ /dev/null @@ -1,282 +0,0 @@ -// ArchiveBox Schema - Prisma ORM -// Prisma uses a declarative schema DSL -// Line count: ~280 lines - -datasource db { - provider = "postgresql" - url = env("DATABASE_URL") -} - -generator client { - provider = "prisma-client-js" - previewFeatures = ["uuidv7"] -} - -// ============================================ -// User Model (Django's default User) -// ============================================ -model User { - id String @id @default(uuidv7()) @db.Uuid - username String @unique @db.VarChar(150) - email String @db.VarChar(254) - password String @db.VarChar(128) - 
first_name String @db.VarChar(150) - last_name String @db.VarChar(150) - is_active Boolean @default(true) - is_staff Boolean @default(false) - is_superuser Boolean @default(false) - date_joined DateTime @default(now()) - last_login DateTime? - - // Relations - tags Tag[] - kv_tags KVTag[] - seeds Seed[] - crawls Crawl[] - crawl_schedules CrawlSchedule[] - snapshots Snapshot[] - archive_results ArchiveResult[] - - @@map("auth_user") -} - -// ============================================ -// Old-style Tag Model (being phased out) -// ============================================ -model Tag { - id String @id @default(uuidv7()) @db.Uuid - abid String @unique @db.VarChar(30) - created_at DateTime @default(now()) @db.Timestamptz - modified_at DateTime @updatedAt @db.Timestamptz - created_by_id String @db.Uuid - name String @unique @db.VarChar(100) - slug String @unique @db.VarChar(100) - - // Relations - created_by User @relation(fields: [created_by_id], references: [id], onDelete: Cascade) - snapshots Snapshot[] @relation("SnapshotTags") - - @@index([created_at]) - @@index([created_by_id]) - @@map("core_tag") -} - -// ============================================ -// New-style KVTag Model (key-value tags) -// ============================================ -model KVTag { - id String @id @default(uuidv7()) @db.Uuid - created_at DateTime @default(now()) @db.Timestamptz - name String @db.VarChar(255) - value String? 
@db.Text - obj_type String @db.VarChar(100) - obj_id String @db.Uuid - created_by_id String @db.Uuid - - // Relations - created_by User @relation(fields: [created_by_id], references: [id], onDelete: Cascade) - - @@unique([obj_id, name]) - @@index([created_at]) - @@index([obj_type]) - @@index([obj_id]) - @@map("core_kvtags") -} - -// ============================================ -// Seed Model (URL source) -// ============================================ -model Seed { - id String @id @default(uuidv7()) @db.Uuid - abid String @unique @db.VarChar(30) - created_at DateTime @default(now()) @db.Timestamptz - modified_at DateTime @updatedAt @db.Timestamptz - created_by_id String @db.Uuid - uri String @db.Text - extractor String @default("auto") @db.VarChar(32) - tags_str String @default("") @db.VarChar(255) - label String @default("") @db.VarChar(255) - config Json @default("{}") - output_dir String @default("") @db.VarChar(255) - notes String @default("") @db.Text - num_uses_failed Int @default(0) - num_uses_succeeded Int @default(0) - - // Relations - created_by User @relation(fields: [created_by_id], references: [id], onDelete: Cascade) - crawls Crawl[] - - @@unique([created_by_id, uri, extractor]) - @@unique([created_by_id, label]) - @@index([created_at]) - @@index([created_by_id]) - @@map("crawls_seed") -} - -// ============================================ -// CrawlSchedule Model -// ============================================ -model CrawlSchedule { - id String @id @default(uuidv7()) @db.Uuid - abid String @unique @db.VarChar(30) - created_at DateTime @default(now()) @db.Timestamptz - modified_at DateTime @updatedAt @db.Timestamptz - created_by_id String @db.Uuid - template_id String @db.Uuid - schedule String @db.VarChar(64) - is_enabled Boolean @default(true) - label String @default("") @db.VarChar(64) - notes String @default("") @db.Text - num_uses_failed Int @default(0) - num_uses_succeeded Int @default(0) - - // Relations - created_by User @relation(fields: 
[created_by_id], references: [id], onDelete: Cascade) - template Crawl @relation("CrawlScheduleTemplate", fields: [template_id], references: [id], onDelete: Cascade) - crawls Crawl[] @relation("ScheduledCrawls") - - @@index([created_at]) - @@index([created_by_id]) - @@map("crawls_crawlschedule") -} - -// ============================================ -// Crawl Model (archiving session) -// ============================================ -model Crawl { - id String @id @default(uuidv7()) @db.Uuid - abid String @unique @db.VarChar(30) - created_at DateTime @default(now()) @db.Timestamptz - modified_at DateTime @updatedAt @db.Timestamptz - created_by_id String @db.Uuid - seed_id String @db.Uuid - urls String @default("") @db.Text - config Json @default("{}") - max_depth Int @default(0) @db.SmallInt - tags_str String @default("") @db.VarChar(1024) - persona_id String? @db.Uuid - label String @default("") @db.VarChar(64) - notes String @default("") @db.Text - schedule_id String? @db.Uuid - status String @default("queued") @db.VarChar(16) - retry_at DateTime @default(now()) @db.Timestamptz - output_dir String @default("") @db.VarChar(255) - num_uses_failed Int @default(0) - num_uses_succeeded Int @default(0) - - // Relations - created_by User @relation(fields: [created_by_id], references: [id], onDelete: Cascade) - seed Seed @relation(fields: [seed_id], references: [id], onDelete: Restrict) - schedule CrawlSchedule? 
@relation("ScheduledCrawls", fields: [schedule_id], references: [id], onDelete: SetNull) - schedules_as_template CrawlSchedule[] @relation("CrawlScheduleTemplate") - snapshots Snapshot[] - outlinks Outlink[] - - @@index([created_at]) - @@index([created_by_id]) - @@index([seed_id]) - @@index([schedule_id]) - @@index([status]) - @@index([retry_at]) - @@map("crawls_crawl") -} - -// ============================================ -// Snapshot Model (archived URL) -// ============================================ -model Snapshot { - id String @id @default(uuidv7()) @db.Uuid - abid String @unique @db.VarChar(30) - created_at DateTime @default(now()) @db.Timestamptz - modified_at DateTime @updatedAt @db.Timestamptz - created_by_id String @db.Uuid - url String @unique @db.Text - timestamp String @unique @db.VarChar(32) - bookmarked_at DateTime @db.Timestamptz - crawl_id String? @db.Uuid - title String? @db.VarChar(512) - downloaded_at DateTime? @db.Timestamptz - retry_at DateTime @default(now()) @db.Timestamptz - status String @default("queued") @db.VarChar(16) - config Json @default("{}") - notes String @default("") @db.Text - output_dir String? @db.VarChar(255) - num_uses_failed Int @default(0) - num_uses_succeeded Int @default(0) - - // Relations - created_by User @relation(fields: [created_by_id], references: [id], onDelete: Cascade) - crawl Crawl? 
@relation(fields: [crawl_id], references: [id], onDelete: Cascade) - tags Tag[] @relation("SnapshotTags") - archive_results ArchiveResult[] - outlinks_via Outlink[] - - @@index([created_at]) - @@index([created_by_id]) - @@index([crawl_id]) - @@index([url]) - @@index([timestamp]) - @@index([bookmarked_at]) - @@index([downloaded_at]) - @@index([title]) - @@index([status]) - @@index([retry_at]) - @@map("core_snapshot") -} - -// ============================================ -// ArchiveResult Model (extraction result) -// ============================================ -model ArchiveResult { - id String @id @default(uuidv7()) @db.Uuid - abid String @unique @db.VarChar(30) - created_at DateTime @default(now()) @db.Timestamptz - modified_at DateTime @updatedAt @db.Timestamptz - created_by_id String @db.Uuid - snapshot_id String @db.Uuid - extractor String @db.VarChar(32) - pwd String? @db.VarChar(256) - cmd Json? - cmd_version String? @db.VarChar(128) - output String? @db.VarChar(1024) - start_ts DateTime? @db.Timestamptz - end_ts DateTime? @db.Timestamptz - status String @default("queued") @db.VarChar(16) - retry_at DateTime @default(now()) @db.Timestamptz - notes String @default("") @db.Text - output_dir String? @db.VarChar(256) - iface_id String? 
@db.Uuid - num_uses_failed Int @default(0) - num_uses_succeeded Int @default(0) - - // Relations - created_by User @relation(fields: [created_by_id], references: [id], onDelete: Cascade) - snapshot Snapshot @relation(fields: [snapshot_id], references: [id], onDelete: Cascade) - outlinks Outlink[] - - @@index([created_at]) - @@index([created_by_id]) - @@index([snapshot_id]) - @@index([extractor]) - @@index([status]) - @@index([retry_at]) - @@map("core_archiveresult") -} - -// ============================================ -// Outlink Model (link found on a page) -// ============================================ -model Outlink { - id String @id @default(uuidv7()) @db.Uuid - src String @db.Text - dst String @db.Text - crawl_id String @db.Uuid - via_id String? @db.Uuid - - // Relations - crawl Crawl @relation(fields: [crawl_id], references: [id], onDelete: Cascade) - via ArchiveResult? @relation(fields: [via_id], references: [id], onDelete: SetNull) - - @@unique([src, dst, via_id]) - @@map("crawls_outlink") -} diff --git a/orm-comparison/schema.typeorm.ts b/orm-comparison/schema.typeorm.ts deleted file mode 100644 index e5b74cea..00000000 --- a/orm-comparison/schema.typeorm.ts +++ /dev/null @@ -1,634 +0,0 @@ -// ArchiveBox Schema - TypeORM -// TypeORM uses TypeScript decorators on classes -// Line count: ~550 lines - -import { - Entity, - PrimaryColumn, - Column, - ManyToOne, - OneToMany, - ManyToMany, - JoinTable, - JoinColumn, - Index, - Unique, - CreateDateColumn, - UpdateDateColumn, - BeforeInsert, -} from 'typeorm'; -import { uuidv7 } from 'uuidv7'; - -// ============================================ -// User Entity (Django's default User) -// ============================================ -@Entity('auth_user') -@Index('auth_user_username_idx', ['username']) -export class User { - @PrimaryColumn('uuid') - id: string; - - @Column({ type: 'varchar', length: 150, unique: true }) - username: string; - - @Column({ type: 'varchar', length: 254 }) - email: string; - - 
@Column({ type: 'varchar', length: 128 }) - password: string; - - @Column({ type: 'varchar', length: 150 }) - first_name: string; - - @Column({ type: 'varchar', length: 150 }) - last_name: string; - - @Column({ type: 'boolean', default: true }) - is_active: boolean; - - @Column({ type: 'boolean', default: false }) - is_staff: boolean; - - @Column({ type: 'boolean', default: false }) - is_superuser: boolean; - - @CreateDateColumn({ type: 'timestamptz' }) - date_joined: Date; - - @Column({ type: 'timestamptz', nullable: true }) - last_login: Date | null; - - // Relations - @OneToMany(() => Tag, tag => tag.created_by) - tags: Tag[]; - - @OneToMany(() => KVTag, kvTag => kvTag.created_by) - kv_tags: KVTag[]; - - @OneToMany(() => Seed, seed => seed.created_by) - seeds: Seed[]; - - @OneToMany(() => Crawl, crawl => crawl.created_by) - crawls: Crawl[]; - - @OneToMany(() => CrawlSchedule, schedule => schedule.created_by) - crawl_schedules: CrawlSchedule[]; - - @OneToMany(() => Snapshot, snapshot => snapshot.created_by) - snapshots: Snapshot[]; - - @OneToMany(() => ArchiveResult, result => result.created_by) - archive_results: ArchiveResult[]; - - @BeforeInsert() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Tag Entity (being phased out) -// ============================================ -@Entity('core_tag') -@Index('core_tag_created_at_idx', ['created_at']) -@Index('core_tag_created_by_idx', ['created_by_id']) -@Index('core_tag_abid_idx', ['abid']) -export class Tag { - @PrimaryColumn('uuid') - id: string; - - @Column({ type: 'varchar', length: 30, unique: true }) - abid: string; - - @CreateDateColumn({ type: 'timestamptz' }) - created_at: Date; - - @UpdateDateColumn({ type: 'timestamptz' }) - modified_at: Date; - - @Column({ type: 'uuid' }) - created_by_id: string; - - @Column({ type: 'varchar', length: 100, unique: true }) - name: string; - - @Column({ type: 'varchar', length: 100, unique: true }) - 
slug: string; - - // Relations - @ManyToOne(() => User, user => user.tags, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'created_by_id' }) - created_by: User; - - @ManyToMany(() => Snapshot, snapshot => snapshot.tags) - snapshots: Snapshot[]; - - @BeforeInsert() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// KVTag Entity (key-value tags) -// ============================================ -@Entity('core_kvtags') -@Unique(['obj_id', 'name']) -@Index('core_kvtags_created_at_idx', ['created_at']) -@Index('core_kvtags_obj_type_idx', ['obj_type']) -@Index('core_kvtags_obj_id_idx', ['obj_id']) -export class KVTag { - @PrimaryColumn('uuid') - id: string; - - @CreateDateColumn({ type: 'timestamptz' }) - created_at: Date; - - @Column({ type: 'varchar', length: 255 }) - name: string; - - @Column({ type: 'text', nullable: true }) - value: string | null; - - @Column({ type: 'varchar', length: 100 }) - obj_type: string; - - @Column({ type: 'uuid' }) - obj_id: string; - - @Column({ type: 'uuid' }) - created_by_id: string; - - // Relations - @ManyToOne(() => User, user => user.kv_tags, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'created_by_id' }) - created_by: User; - - @BeforeInsert() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Seed Entity -// ============================================ -@Entity('crawls_seed') -@Unique(['created_by_id', 'uri', 'extractor']) -@Unique(['created_by_id', 'label']) -@Index('crawls_seed_created_at_idx', ['created_at']) -@Index('crawls_seed_created_by_idx', ['created_by_id']) -@Index('crawls_seed_abid_idx', ['abid']) -export class Seed { - @PrimaryColumn('uuid') - id: string; - - @Column({ type: 'varchar', length: 30, unique: true }) - abid: string; - - @CreateDateColumn({ type: 'timestamptz' }) - created_at: Date; - - @UpdateDateColumn({ type: 'timestamptz' }) - modified_at: Date; - - @Column({ 
type: 'uuid' }) - created_by_id: string; - - @Column({ type: 'text' }) - uri: string; - - @Column({ type: 'varchar', length: 32, default: 'auto' }) - extractor: string; - - @Column({ type: 'varchar', length: 255, default: '' }) - tags_str: string; - - @Column({ type: 'varchar', length: 255, default: '' }) - label: string; - - @Column({ type: 'jsonb', default: {} }) - config: object; - - @Column({ type: 'varchar', length: 255, default: '' }) - output_dir: string; - - @Column({ type: 'text', default: '' }) - notes: string; - - @Column({ type: 'int', default: 0 }) - num_uses_failed: number; - - @Column({ type: 'int', default: 0 }) - num_uses_succeeded: number; - - // Relations - @ManyToOne(() => User, user => user.seeds, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'created_by_id' }) - created_by: User; - - @OneToMany(() => Crawl, crawl => crawl.seed) - crawls: Crawl[]; - - @BeforeInsert() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// CrawlSchedule Entity -// ============================================ -@Entity('crawls_crawlschedule') -@Index('crawls_crawlschedule_created_at_idx', ['created_at']) -@Index('crawls_crawlschedule_created_by_idx', ['created_by_id']) -@Index('crawls_crawlschedule_template_idx', ['template_id']) -@Index('crawls_crawlschedule_abid_idx', ['abid']) -export class CrawlSchedule { - @PrimaryColumn('uuid') - id: string; - - @Column({ type: 'varchar', length: 30, unique: true }) - abid: string; - - @CreateDateColumn({ type: 'timestamptz' }) - created_at: Date; - - @UpdateDateColumn({ type: 'timestamptz' }) - modified_at: Date; - - @Column({ type: 'uuid' }) - created_by_id: string; - - @Column({ type: 'uuid' }) - template_id: string; - - @Column({ type: 'varchar', length: 64 }) - schedule: string; - - @Column({ type: 'boolean', default: true }) - is_enabled: boolean; - - @Column({ type: 'varchar', length: 64, default: '' }) - label: string; - - @Column({ type: 'text', 
default: '' }) - notes: string; - - @Column({ type: 'int', default: 0 }) - num_uses_failed: number; - - @Column({ type: 'int', default: 0 }) - num_uses_succeeded: number; - - // Relations - @ManyToOne(() => User, user => user.crawl_schedules, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'created_by_id' }) - created_by: User; - - @ManyToOne(() => Crawl, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'template_id' }) - template: Crawl; - - @OneToMany(() => Crawl, crawl => crawl.schedule) - crawls: Crawl[]; - - @BeforeInsert() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Crawl Entity -// ============================================ -@Entity('crawls_crawl') -@Index('crawls_crawl_created_at_idx', ['created_at']) -@Index('crawls_crawl_created_by_idx', ['created_by_id']) -@Index('crawls_crawl_seed_idx', ['seed_id']) -@Index('crawls_crawl_schedule_idx', ['schedule_id']) -@Index('crawls_crawl_status_idx', ['status']) -@Index('crawls_crawl_retry_at_idx', ['retry_at']) -@Index('crawls_crawl_abid_idx', ['abid']) -export class Crawl { - @PrimaryColumn('uuid') - id: string; - - @Column({ type: 'varchar', length: 30, unique: true }) - abid: string; - - @CreateDateColumn({ type: 'timestamptz' }) - created_at: Date; - - @UpdateDateColumn({ type: 'timestamptz' }) - modified_at: Date; - - @Column({ type: 'uuid' }) - created_by_id: string; - - @Column({ type: 'uuid' }) - seed_id: string; - - @Column({ type: 'text', default: '' }) - urls: string; - - @Column({ type: 'jsonb', default: {} }) - config: object; - - @Column({ type: 'smallint', default: 0 }) - max_depth: number; - - @Column({ type: 'varchar', length: 1024, default: '' }) - tags_str: string; - - @Column({ type: 'uuid', nullable: true }) - persona_id: string | null; - - @Column({ type: 'varchar', length: 64, default: '' }) - label: string; - - @Column({ type: 'text', default: '' }) - notes: string; - - @Column({ type: 'uuid', nullable: true }) - 
schedule_id: string | null; - - @Column({ type: 'varchar', length: 16, default: 'queued' }) - status: string; - - @Column({ type: 'timestamptz', default: () => 'CURRENT_TIMESTAMP' }) - retry_at: Date; - - @Column({ type: 'varchar', length: 255, default: '' }) - output_dir: string; - - @Column({ type: 'int', default: 0 }) - num_uses_failed: number; - - @Column({ type: 'int', default: 0 }) - num_uses_succeeded: number; - - // Relations - @ManyToOne(() => User, user => user.crawls, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'created_by_id' }) - created_by: User; - - @ManyToOne(() => Seed, seed => seed.crawls, { onDelete: 'RESTRICT' }) - @JoinColumn({ name: 'seed_id' }) - seed: Seed; - - @ManyToOne(() => CrawlSchedule, schedule => schedule.crawls, { onDelete: 'SET NULL', nullable: true }) - @JoinColumn({ name: 'schedule_id' }) - schedule: CrawlSchedule | null; - - @OneToMany(() => Snapshot, snapshot => snapshot.crawl) - snapshots: Snapshot[]; - - @OneToMany(() => Outlink, outlink => outlink.crawl) - outlinks: Outlink[]; - - @BeforeInsert() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Snapshot Entity -// ============================================ -@Entity('core_snapshot') -@Index('core_snapshot_created_at_idx', ['created_at']) -@Index('core_snapshot_created_by_idx', ['created_by_id']) -@Index('core_snapshot_crawl_idx', ['crawl_id']) -@Index('core_snapshot_url_idx', ['url']) -@Index('core_snapshot_timestamp_idx', ['timestamp']) -@Index('core_snapshot_bookmarked_at_idx', ['bookmarked_at']) -@Index('core_snapshot_downloaded_at_idx', ['downloaded_at']) -@Index('core_snapshot_title_idx', ['title']) -@Index('core_snapshot_status_idx', ['status']) -@Index('core_snapshot_retry_at_idx', ['retry_at']) -@Index('core_snapshot_abid_idx', ['abid']) -export class Snapshot { - @PrimaryColumn('uuid') - id: string; - - @Column({ type: 'varchar', length: 30, unique: true }) - abid: string; - - 
@CreateDateColumn({ type: 'timestamptz' }) - created_at: Date; - - @UpdateDateColumn({ type: 'timestamptz' }) - modified_at: Date; - - @Column({ type: 'uuid' }) - created_by_id: string; - - @Column({ type: 'text', unique: true }) - url: string; - - @Column({ type: 'varchar', length: 32, unique: true }) - timestamp: string; - - @Column({ type: 'timestamptz' }) - bookmarked_at: Date; - - @Column({ type: 'uuid', nullable: true }) - crawl_id: string | null; - - @Column({ type: 'varchar', length: 512, nullable: true }) - title: string | null; - - @Column({ type: 'timestamptz', nullable: true }) - downloaded_at: Date | null; - - @Column({ type: 'timestamptz', default: () => 'CURRENT_TIMESTAMP' }) - retry_at: Date; - - @Column({ type: 'varchar', length: 16, default: 'queued' }) - status: string; - - @Column({ type: 'jsonb', default: {} }) - config: object; - - @Column({ type: 'text', default: '' }) - notes: string; - - @Column({ type: 'varchar', length: 255, nullable: true }) - output_dir: string | null; - - @Column({ type: 'int', default: 0 }) - num_uses_failed: number; - - @Column({ type: 'int', default: 0 }) - num_uses_succeeded: number; - - // Relations - @ManyToOne(() => User, user => user.snapshots, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'created_by_id' }) - created_by: User; - - @ManyToOne(() => Crawl, crawl => crawl.snapshots, { onDelete: 'CASCADE', nullable: true }) - @JoinColumn({ name: 'crawl_id' }) - crawl: Crawl | null; - - @ManyToMany(() => Tag, tag => tag.snapshots) - @JoinTable({ - name: 'core_snapshot_tags', - joinColumn: { name: 'snapshot_id', referencedColumnName: 'id' }, - inverseJoinColumn: { name: 'tag_id', referencedColumnName: 'id' }, - }) - tags: Tag[]; - - @OneToMany(() => ArchiveResult, result => result.snapshot) - archive_results: ArchiveResult[]; - - @BeforeInsert() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// ArchiveResult Entity -// 
============================================ -@Entity('core_archiveresult') -@Index('core_archiveresult_created_at_idx', ['created_at']) -@Index('core_archiveresult_created_by_idx', ['created_by_id']) -@Index('core_archiveresult_snapshot_idx', ['snapshot_id']) -@Index('core_archiveresult_extractor_idx', ['extractor']) -@Index('core_archiveresult_status_idx', ['status']) -@Index('core_archiveresult_retry_at_idx', ['retry_at']) -@Index('core_archiveresult_abid_idx', ['abid']) -export class ArchiveResult { - @PrimaryColumn('uuid') - id: string; - - @Column({ type: 'varchar', length: 30, unique: true }) - abid: string; - - @CreateDateColumn({ type: 'timestamptz' }) - created_at: Date; - - @UpdateDateColumn({ type: 'timestamptz' }) - modified_at: Date; - - @Column({ type: 'uuid' }) - created_by_id: string; - - @Column({ type: 'uuid' }) - snapshot_id: string; - - @Column({ type: 'varchar', length: 32 }) - extractor: string; - - @Column({ type: 'varchar', length: 256, nullable: true }) - pwd: string | null; - - @Column({ type: 'jsonb', nullable: true }) - cmd: object | null; - - @Column({ type: 'varchar', length: 128, nullable: true }) - cmd_version: string | null; - - @Column({ type: 'varchar', length: 1024, nullable: true }) - output: string | null; - - @Column({ type: 'timestamptz', nullable: true }) - start_ts: Date | null; - - @Column({ type: 'timestamptz', nullable: true }) - end_ts: Date | null; - - @Column({ type: 'varchar', length: 16, default: 'queued' }) - status: string; - - @Column({ type: 'timestamptz', default: () => 'CURRENT_TIMESTAMP' }) - retry_at: Date; - - @Column({ type: 'text', default: '' }) - notes: string; - - @Column({ type: 'varchar', length: 256, nullable: true }) - output_dir: string | null; - - @Column({ type: 'uuid', nullable: true }) - iface_id: string | null; - - @Column({ type: 'int', default: 0 }) - num_uses_failed: number; - - @Column({ type: 'int', default: 0 }) - num_uses_succeeded: number; - - // Relations - @ManyToOne(() => User, 
user => user.archive_results, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'created_by_id' }) - created_by: User; - - @ManyToOne(() => Snapshot, snapshot => snapshot.archive_results, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'snapshot_id' }) - snapshot: Snapshot; - - @OneToMany(() => Outlink, outlink => outlink.via) - outlinks: Outlink[]; - - @BeforeInsert() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -} - -// ============================================ -// Outlink Entity -// ============================================ -@Entity('crawls_outlink') -@Unique(['src', 'dst', 'via_id']) -export class Outlink { - @PrimaryColumn('uuid') - id: string; - - @Column({ type: 'text' }) - src: string; - - @Column({ type: 'text' }) - dst: string; - - @Column({ type: 'uuid' }) - crawl_id: string; - - @Column({ type: 'uuid', nullable: true }) - via_id: string | null; - - // Relations - @ManyToOne(() => Crawl, crawl => crawl.outlinks, { onDelete: 'CASCADE' }) - @JoinColumn({ name: 'crawl_id' }) - crawl: Crawl; - - @ManyToOne(() => ArchiveResult, result => result.outlinks, { onDelete: 'SET NULL', nullable: true }) - @JoinColumn({ name: 'via_id' }) - via: ArchiveResult | null; - - @BeforeInsert() - generateId() { - if (!this.id) { - this.id = uuidv7(); - } - } -}