Files
ArchiveBox/archivebox/crawls/migrations/0001_initial.py
2025-12-29 22:12:57 -08:00

142 lines
7.7 KiB
Python

# Generated by hand on 2025-12-29
# Creates Crawl and CrawlSchedule tables using raw SQL
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
import django.core.validators
from django.conf import settings
from archivebox.uuid_compat import uuid7
from archivebox.base_models.models import get_or_create_system_user_pk
# Forward DDL, passed verbatim to RunSQL. Every statement uses IF NOT EXISTS.
# NOTE(review): the SQL comment below mentions a circular FK being added
# later, but within this migration `template_id` is created as a plain
# NOT NULL TEXT column with no FOREIGN KEY clause; only the Django state
# (see the AddField at the end of the state operations) treats it as a
# ForeignKey to crawls.Crawl.
_CREATE_CRAWL_TABLES_SQL = """
-- Create crawls_crawlschedule table first (circular FK will be added later)
CREATE TABLE IF NOT EXISTS crawls_crawlschedule (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
schedule VARCHAR(64) NOT NULL,
is_enabled BOOLEAN NOT NULL DEFAULT 1,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
template_id TEXT NOT NULL,
created_by_id INTEGER NOT NULL,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_at_idx ON crawls_crawlschedule(created_at);
CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_by_id_idx ON crawls_crawlschedule(created_by_id);
CREATE INDEX IF NOT EXISTS crawls_crawlschedule_template_id_idx ON crawls_crawlschedule(template_id);
-- Create crawls_crawl table
CREATE TABLE IF NOT EXISTS crawls_crawl (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
urls TEXT NOT NULL,
config TEXT,
max_depth INTEGER NOT NULL DEFAULT 0,
tags_str VARCHAR(1024) NOT NULL DEFAULT '',
persona_id TEXT,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
output_dir VARCHAR(512) NOT NULL DEFAULT '',
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
created_by_id INTEGER NOT NULL,
schedule_id TEXT,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
);
CREATE INDEX IF NOT EXISTS crawls_crawl_status_idx ON crawls_crawl(status);
CREATE INDEX IF NOT EXISTS crawls_crawl_retry_at_idx ON crawls_crawl(retry_at);
CREATE INDEX IF NOT EXISTS crawls_crawl_created_at_idx ON crawls_crawl(created_at);
CREATE INDEX IF NOT EXISTS crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id);
CREATE INDEX IF NOT EXISTS crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id);
"""

# Reverse DDL: crawls_crawl is dropped first because it holds the FK that
# references crawls_crawlschedule.
_DROP_CRAWL_TABLES_SQL = """
DROP TABLE IF EXISTS crawls_crawl;
DROP TABLE IF EXISTS crawls_crawlschedule;
"""


def _shared_leading_fields():
    """Return new field instances for the five fields both models start with."""
    return [
        ('id', models.UUIDField(primary_key=True, default=uuid7, editable=False, serialize=False, unique=True)),
        ('created_at', models.DateTimeField(default=django.utils.timezone.now, db_index=True)),
        ('modified_at', models.DateTimeField(auto_now=True)),
        ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
        ('num_uses_failed', models.PositiveIntegerField(default=0)),
    ]


def _created_by_field():
    """Return a new ``created_by`` (name, field) pair pointing at the user model."""
    owner_fk = models.ForeignKey(
        to=settings.AUTH_USER_MODEL,
        on_delete=django.db.models.deletion.CASCADE,
        default=get_or_create_system_user_pk,
    )
    return ('created_by', owner_fk)


def _crawl_schedule_fields():
    """Return the state fields of CrawlSchedule, in declaration order.

    The ``template`` FK is not included here; it is attached by a separate
    AddField operation after the Crawl model has been declared.
    """
    return [
        *_shared_leading_fields(),
        ('schedule', models.CharField(max_length=64)),
        ('is_enabled', models.BooleanField(default=True)),
        ('label', models.CharField(max_length=64, blank=True, default='')),
        ('notes', models.TextField(blank=True, default='')),
        _created_by_field(),
    ]


def _crawl_fields():
    """Return the state fields of Crawl, in declaration order."""
    depth_validators = [
        django.core.validators.MinValueValidator(0),
        django.core.validators.MaxValueValidator(4),
    ]
    status_choices = [('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')]
    return [
        *_shared_leading_fields(),
        ('urls', models.TextField(help_text='Newline-separated list of URLs to crawl')),
        ('config', models.JSONField(default=dict, blank=True, null=True)),
        ('max_depth', models.PositiveSmallIntegerField(default=0, validators=depth_validators)),
        ('tags_str', models.CharField(max_length=1024, blank=True, default='')),
        ('persona_id', models.UUIDField(blank=True, null=True)),
        ('label', models.CharField(max_length=64, blank=True, default='')),
        ('notes', models.TextField(blank=True, default='')),
        ('output_dir', models.CharField(max_length=512, blank=True, default='')),
        ('status', models.CharField(max_length=15, choices=status_choices, default='queued', db_index=True)),
        ('retry_at', models.DateTimeField(default=django.utils.timezone.now, blank=True, null=True, db_index=True)),
        _created_by_field(),
        (
            'schedule',
            models.ForeignKey(
                to='crawls.crawlschedule',
                on_delete=django.db.models.deletion.SET_NULL,
                blank=True,
                editable=True,
                null=True,
            ),
        ),
    ]


class Migration(migrations.Migration):
    """Initial migration for the ``crawls`` app.

    The database side is a single RunSQL operation that creates the
    ``crawls_crawlschedule`` and ``crawls_crawl`` tables and their indexes.
    The state side declares the matching CrawlSchedule and Crawl models and
    then adds CrawlSchedule.template. The two sides are kept apart with
    SeparateDatabaseAndState, so the state operations do not generate DDL.
    """

    initial = True

    dependencies = [
        ('auth', '0012_alter_user_first_name_max_length'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.SeparateDatabaseAndState(
            database_operations=[
                migrations.RunSQL(
                    sql=_CREATE_CRAWL_TABLES_SQL,
                    reverse_sql=_DROP_CRAWL_TABLES_SQL,
                ),
            ],
            state_operations=[
                migrations.CreateModel(
                    name='CrawlSchedule',
                    fields=_crawl_schedule_fields(),
                    options={
                        'verbose_name': 'Scheduled Crawl',
                        'verbose_name_plural': 'Scheduled Crawls',
                        'app_label': 'crawls',
                    },
                ),
                migrations.CreateModel(
                    name='Crawl',
                    fields=_crawl_fields(),
                    options={
                        'verbose_name': 'Crawl',
                        'verbose_name_plural': 'Crawls',
                        'app_label': 'crawls',
                    },
                ),
                # Added after both models exist because CrawlSchedule.template
                # and Crawl.schedule point at each other.
                migrations.AddField(
                    model_name='crawlschedule',
                    name='template',
                    field=models.ForeignKey(
                        to='crawls.crawl',
                        on_delete=django.db.models.deletion.CASCADE,
                    ),
                ),
            ],
        ),
    ]