Files
ArchiveBox/archivebox/crawls/migrations/0001_initial.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

178 lines
8.8 KiB
Python

# Generated by hand on 2025-12-29
# Creates Crawl and CrawlSchedule tables using raw SQL
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
import django.core.validators
from django.conf import settings
from archivebox.uuid_compat import uuid7
from archivebox.base_models.models import get_or_create_system_user_pk
# Forward DDL handed verbatim to RunSQL. It creates both tables and their
# indexes; every statement uses IF NOT EXISTS. Note that template_id is
# declared here as a plain NOT NULL TEXT column with no FOREIGN KEY clause,
# while the Django-side relation is declared in the state operations below.
_CREATE_CRAWL_TABLES_SQL = """
-- Create crawls_crawlschedule table first (circular FK will be added later)
CREATE TABLE IF NOT EXISTS crawls_crawlschedule (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
schedule VARCHAR(64) NOT NULL,
is_enabled BOOLEAN NOT NULL DEFAULT 1,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
template_id TEXT NOT NULL,
created_by_id INTEGER NOT NULL,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_at_idx ON crawls_crawlschedule(created_at);
CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_by_id_idx ON crawls_crawlschedule(created_by_id);
CREATE INDEX IF NOT EXISTS crawls_crawlschedule_template_id_idx ON crawls_crawlschedule(template_id);
-- Create crawls_crawl table
CREATE TABLE IF NOT EXISTS crawls_crawl (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
urls TEXT NOT NULL,
config TEXT,
max_depth INTEGER NOT NULL DEFAULT 0,
tags_str VARCHAR(1024) NOT NULL DEFAULT '',
persona_id TEXT,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
output_dir VARCHAR(512) NOT NULL DEFAULT '',
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
created_by_id INTEGER NOT NULL,
schedule_id TEXT,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
);
CREATE INDEX IF NOT EXISTS crawls_crawl_status_idx ON crawls_crawl(status);
CREATE INDEX IF NOT EXISTS crawls_crawl_retry_at_idx ON crawls_crawl(retry_at);
CREATE INDEX IF NOT EXISTS crawls_crawl_created_at_idx ON crawls_crawl(created_at);
CREATE INDEX IF NOT EXISTS crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id);
CREATE INDEX IF NOT EXISTS crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id);
"""

# Reverse DDL: crawls_crawl is dropped first because it carries the FK that
# references crawls_crawlschedule.
_DROP_CRAWL_TABLES_SQL = """
DROP TABLE IF EXISTS crawls_crawl;
DROP TABLE IF EXISTS crawls_crawlschedule;
"""


def _leading_fields():
    """Return fresh (name, field) pairs for the five columns both models start with."""
    return [
        ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
        ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
        ("modified_at", models.DateTimeField(auto_now=True)),
        ("num_uses_succeeded", models.PositiveIntegerField(default=0)),
        ("num_uses_failed", models.PositiveIntegerField(default=0)),
    ]


def _created_by_field():
    """Return a fresh ``created_by`` (name, field) pair pointing at the user model."""
    owner_fk = models.ForeignKey(
        default=get_or_create_system_user_pk,
        on_delete=django.db.models.deletion.CASCADE,
        to=settings.AUTH_USER_MODEL,
    )
    return ("created_by", owner_fk)


# --- Model state only: these operations are passed as state_operations, so
# --- they update Django's migration state; the tables come from the raw SQL.

_CRAWL_SCHEDULE_STATE = migrations.CreateModel(
    name="CrawlSchedule",
    fields=[
        *_leading_fields(),
        ("schedule", models.CharField(max_length=64)),
        ("is_enabled", models.BooleanField(default=True)),
        ("label", models.CharField(blank=True, default="", max_length=64)),
        ("notes", models.TextField(blank=True, default="")),
        _created_by_field(),
    ],
    options={
        "verbose_name": "Scheduled Crawl",
        "verbose_name_plural": "Scheduled Crawls",
        "app_label": "crawls",
    },
)

_CRAWL_STATE = migrations.CreateModel(
    name="Crawl",
    fields=[
        *_leading_fields(),
        ("urls", models.TextField(help_text="Newline-separated list of URLs to crawl")),
        ("config", models.JSONField(blank=True, default=dict, null=True)),
        (
            "max_depth",
            models.PositiveSmallIntegerField(
                default=0,
                validators=[
                    django.core.validators.MinValueValidator(0),
                    django.core.validators.MaxValueValidator(4),
                ],
            ),
        ),
        ("tags_str", models.CharField(blank=True, default="", max_length=1024)),
        ("persona_id", models.UUIDField(blank=True, null=True)),
        ("label", models.CharField(blank=True, default="", max_length=64)),
        ("notes", models.TextField(blank=True, default="")),
        ("output_dir", models.CharField(blank=True, default="", max_length=512)),
        (
            "status",
            models.CharField(
                choices=[("queued", "Queued"), ("started", "Started"), ("sealed", "Sealed")],
                db_index=True,
                default="queued",
                max_length=15,
            ),
        ),
        (
            "retry_at",
            models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        _created_by_field(),
        (
            "schedule",
            models.ForeignKey(
                blank=True,
                editable=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to="crawls.crawlschedule",
            ),
        ),
    ],
    options={
        "verbose_name": "Crawl",
        "verbose_name_plural": "Crawls",
        "app_label": "crawls",
    },
)

# CrawlSchedule.template points at Crawl while Crawl.schedule points back at
# CrawlSchedule, so this half of the cycle is attached only after both models
# exist in the migration state.
_SCHEDULE_TEMPLATE_STATE = migrations.AddField(
    model_name="crawlschedule",
    name="template",
    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="crawls.crawl"),
)


class Migration(migrations.Migration):
    """Initial ``crawls`` migration: raw-SQL tables plus matching model state.

    A single ``SeparateDatabaseAndState`` operation keeps the two halves apart:
    the database side runs the hand-written DDL (reversible via the DROP
    statements), and the state side declares the ``CrawlSchedule`` and
    ``Crawl`` models and the ``CrawlSchedule.template`` relation.
    """

    initial = True

    dependencies = [
        ("auth", "0012_alter_user_first_name_max_length"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.SeparateDatabaseAndState(
            database_operations=[
                migrations.RunSQL(
                    sql=_CREATE_CRAWL_TABLES_SQL,
                    reverse_sql=_DROP_CRAWL_TABLES_SQL,
                ),
            ],
            state_operations=[
                _CRAWL_SCHEDULE_STATE,
                _CRAWL_STATE,
                _SCHEDULE_TEMPLATE_STATE,
            ],
        ),
    ]