Remove extractor field from Crawl model and fix tests

- Remove extractor field from Crawl model (moved to config dict)
- Update migration 0002_drop_seed_model to not add extractor
- Update archivebox_add.py to use config['PARSER'] instead
- Update admin.py recrawl to not pass extractor
- Update jsonl.py serialization to not include extractor
- Update test schema SCHEMA_0_8 to not include extractor
- Set default timeout to 60s for test commands
This commit is contained in:
Claude
2025-12-27 01:49:09 +00:00
parent ae2ab5b273
commit c3acadd528
6 changed files with 63 additions and 119 deletions

View File

@@ -233,7 +233,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
new_crawl = Crawl.objects.create(
urls=obj.urls,
extractor=obj.extractor,
max_depth=obj.max_depth,
tags_str=obj.tags_str,
config=obj.config,

View File

@@ -20,11 +20,6 @@ class Migration(migrations.Migration):
model_name='crawl',
name='seed',
),
migrations.AddField(
model_name='crawl',
name='extractor',
field=models.CharField(default='auto', help_text='Parser for reading URLs (auto, html, json, rss, etc)', max_length=32),
),
migrations.AlterField(
model_name='crawl',
name='created_by',

View File

@@ -61,7 +61,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
modified_at = models.DateTimeField(auto_now=True)
urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
extractor = models.CharField(default='auto', max_length=32, help_text='Parser for reading URLs (auto, html, json, rss, etc)')
config = models.JSONField(default=dict)
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')