diff --git a/archivebox/__init__.py b/archivebox/__init__.py
index 8c65a60f..066c2ee7 100755
--- a/archivebox/__init__.py
+++ b/archivebox/__init__.py
@@ -85,7 +85,6 @@ ARCHIVEBOX_BUILTIN_PLUGINS = {
'workers': PACKAGE_DIR / 'workers',
'core': PACKAGE_DIR / 'core',
'crawls': PACKAGE_DIR / 'crawls',
- 'seeds': PACKAGE_DIR / 'seeds',
# 'search': PACKAGE_DIR / 'search',
# 'core': PACKAGE_DIR / 'core',
}
diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py
index 97e95a6a..2c8ac63d 100644
--- a/archivebox/api/v1_crawls.py
+++ b/archivebox/api/v1_crawls.py
@@ -10,8 +10,7 @@ from django.contrib.auth import get_user_model
from ninja import Router, Schema
from core.models import Snapshot
-from crawls.models import Crawl
-from seeds.models import Seed
+from crawls.models import Seed, Crawl
from .auth import API_AUTH_METHODS
@@ -19,7 +18,7 @@ router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
class SeedSchema(Schema):
- TYPE: str = 'seeds.models.Seed'
+ TYPE: str = 'crawls.models.Seed'
id: UUID
abid: str
@@ -60,7 +59,7 @@ def get_seed(request, seed_id: str):
class CrawlSchema(Schema):
- TYPE: str = 'core.models.Crawl'
+ TYPE: str = 'crawls.models.Crawl'
id: UUID
abid: str
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index db0bb305..c90ed323 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -51,8 +51,7 @@ def add(urls: str | list[str],
setup_django()
check_data_folder()
- from seeds.models import Seed
- from crawls.models import Crawl
+ from crawls.models import Seed, Crawl
from workers.orchestrator import Orchestrator
from abid_utils.models import get_or_create_system_user_pk
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 43853df2..22b0d9a4 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -65,8 +65,7 @@ INSTALLED_APPS = [
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
'workers', # handles starting and managing background workers and processes (orchestrators and actors)
- 'seeds', # handles Seed model and URL source management
- 'crawls', # handles Crawl and CrawlSchedule models and management
+ 'crawls', # handles Seed, Crawl, and CrawlSchedule models and management
'personas', # handles Persona and session management
'core', # core django model with Snapshot, ArchiveResult, etc.
'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index c08cfbde..34221fa6 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -1,7 +1,5 @@
__package__ = 'archivebox.crawls'
-import abx
-
from django.utils.html import format_html, format_html_join
from django.contrib import admin
@@ -10,7 +8,59 @@ from archivebox import DATA_DIR
from abid_utils.admin import ABIDModelAdmin
from core.models import Snapshot
-from crawls.models import Crawl, CrawlSchedule
+from crawls.models import Seed, Crawl, CrawlSchedule
+
+
+class SeedAdmin(ABIDModelAdmin):
+ list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
+ sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+ search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+
+ readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
+ fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
+
+ list_filter = ('extractor', 'created_by')
+ ordering = ['-created_at']
+ list_per_page = 100
+ actions = ["delete_selected"]
+
+ def num_crawls(self, obj):
+ return obj.crawl_set.count()
+
+ def num_snapshots(self, obj):
+ return obj.snapshot_set.count()
+
+ def scheduled_crawls(self, obj):
+ return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+ (scheduledcrawl.admin_change_url, scheduledcrawl)
+ for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
+ )) or format_html('<i>No Scheduled Crawls yet...</i>')
+
+ def crawls(self, obj):
+ return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+ (crawl.admin_change_url, crawl)
+ for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
+ )) or format_html('<i>No Crawls yet...</i>')
+
+ def snapshots(self, obj):
+ return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+ (snapshot.admin_change_url, snapshot)
+ for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
+ )) or format_html('<i>No Snapshots yet...</i>')
+
+ def contents(self, obj):
+ if obj.uri.startswith('file:///data/'):
+ source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
+ contents = ""
+ try:
+ contents = source_file.read_text().strip()[:14_000]
+ except Exception as e:
+ contents = f'Error reading {source_file}: {e}'
+
+ return format_html('<b>{}:</b><br/><pre>{}</pre>', source_file, contents)
+
+ return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
+
@@ -102,7 +152,8 @@ class CrawlScheduleAdmin(ABIDModelAdmin):
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
)) or format_html('No Snapshots yet...')
-@abx.hookimpl
+
def register_admin(admin_site):
+ admin_site.register(Seed, SeedAdmin)
admin_site.register(Crawl, CrawlAdmin)
admin_site.register(CrawlSchedule, CrawlScheduleAdmin)
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index e0f8a299..d37908af 100644
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -1,6 +1,7 @@
__package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING
+from pathlib import Path
from django_stubs_ext.db.models import TypedModelMeta
from django.db import models
@@ -12,12 +13,114 @@ from django.utils import timezone
from workers.models import ModelWithStateMachine
+from archivebox.config import CONSTANTS
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
+
if TYPE_CHECKING:
from core.models import Snapshot, ArchiveResult
-from seeds.models import Seed
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+class Seed(ABIDModel, ModelWithHealthStats):
+ """
+ A fountain that produces URLs (+metadata) each time it's queried e.g.
+ - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+ - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
+ - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
+ - https://getpocket.com/user/nikisweeting/feed
+ - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+ - ...
+ Each query of a Seed can produce the same list of URLs, or a different list each time.
+ The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
+
+ When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
+ The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
+ The outlinks then get turned into new pending Snapshots under the same crawl,
+ and the cycle repeats until Crawl.max_depth.
+
+ Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
+ stateful remote services, files with contents that change, directories that have new files within, etc.
+ """
+
+ abid_prefix = 'src_'
+ abid_ts_src = 'self.created_at'
+ abid_uri_src = 'self.uri'
+ abid_subtype_src = 'self.extractor'
+ abid_rand_src = 'self.id'
+ abid_drift_allowed = True
+
+ id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+ abid = ABIDField(prefix=abid_prefix)
+
+ uri = models.URLField(max_length=2000, blank=False, null=False) # unique source location where URLs will be loaded from
+ label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
+ notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
+
+ extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
+ tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
+ config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
+
+ created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+ modified_at = models.DateTimeField(auto_now=True)
+ created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+
+ crawl_set: models.Manager['Crawl']
+
+ class Meta:
+ verbose_name = 'Seed'
+ verbose_name_plural = 'Seeds'
+
+ unique_together = (('created_by', 'uri', 'extractor'),)
+
+
+ @classmethod
+ def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
+ source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
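+ # e.g. (illustrative) $DATA_DIR/sources/bookmarks.txt -> '/data/sources/bookmarks.txt', so the stored
+ # uri is independent of where the data directory happens to be mounted on this machine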
+
+ seed, _ = cls.objects.get_or_create(
+ label=label or source_file.name,
+ uri=f'file://{source_path}',
+ created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
+ extractor=parser,
+ tags_str=tag,
+ config=config or {},
+ )
+ seed.save()
+ return seed
+
+ @property
+ def source_type(self):
+ # e.g. http/https://
+ # file://
+ # pocketapi://
+ # s3://
+ # etc..
+ return self.uri.split('://', 1)[0].lower()
+
+ @property
+ def api_url(self) -> str:
+ # /api/v1/core/seed/{uulid}
+ return reverse_lazy('api-1:get_seed', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}'
+
+ @property
+ def api_docs_url(self) -> str:
+ return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
+
+ @property
+ def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
+ from crawls.models import CrawlSchedule
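+ # i.e. schedules whose template Crawl points at this Seed (CrawlSchedule.template -> Crawl -> seed)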
+ return CrawlSchedule.objects.filter(template__seed_id=self.pk)
+
+ @property
+ def snapshot_set(self) -> QuerySet['Snapshot']:
+ from core.models import Snapshot
+
+ crawl_ids = self.crawl_set.values_list('pk', flat=True)
+ return Snapshot.objects.filter(crawl_id__in=crawl_ids)
+
+
+
class CrawlSchedule(ABIDModel, ModelWithHealthStats):
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 921c074f..ea059db1 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -34,7 +34,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
return []
-# This should be abstracted by a plugin interface for extractors
+# TODO: This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
if not results:
diff --git a/archivebox/search/admin.py b/archivebox/search/admin.py
index 42aadf6f..0f7bcc8c 100644
--- a/archivebox/search/admin.py
+++ b/archivebox/search/admin.py
@@ -1,10 +1,11 @@
__package__ = 'archivebox.search'
from django.contrib import messages
+from django.contrib import admin
from archivebox.search import query_search_index
-class SearchResultsAdminMixin:
+class SearchResultsAdminMixin(admin.ModelAdmin):
def get_search_results(self, request, queryset, search_term: str):
"""Enhances the search queryset with results from the search backend"""
diff --git a/archivebox/seeds/__init__.py b/archivebox/seeds/__init__.py
deleted file mode 100644
index 7c3cd823..00000000
--- a/archivebox/seeds/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-
-__package__ = 'archivebox.seeds'
-__order__ = 100
-
-import abx
-
-
-@abx.hookimpl
-def register_admin(admin_site):
- from .admin import register_admin as register_seeds_admin
- register_seeds_admin(admin_site)
-
diff --git a/archivebox/seeds/admin.py b/archivebox/seeds/admin.py
deleted file mode 100644
index 84f76c46..00000000
--- a/archivebox/seeds/admin.py
+++ /dev/null
@@ -1,68 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-import abx
-
-from django.utils.html import format_html_join, format_html
-
-from abid_utils.admin import ABIDModelAdmin
-
-from archivebox import DATA_DIR
-
-from seeds.models import Seed
-
-
-
-class SeedAdmin(ABIDModelAdmin):
- list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
- sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
- search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-
- readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
- fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
-
- list_filter = ('extractor', 'created_by')
- ordering = ['-created_at']
- list_per_page = 100
- actions = ["delete_selected"]
-
- def num_crawls(self, obj):
- return obj.crawl_set.count()
-
- def num_snapshots(self, obj):
- return obj.snapshot_set.count()
-
- def scheduled_crawls(self, obj):
- return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
- (scheduledcrawl.admin_change_url, scheduledcrawl)
- for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
- )) or format_html('<i>No Scheduled Crawls yet...</i>')
-
- def crawls(self, obj):
- return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
- (crawl.admin_change_url, crawl)
- for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
- )) or format_html('<i>No Crawls yet...</i>')
-
- def snapshots(self, obj):
- return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
- (snapshot.admin_change_url, snapshot)
- for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
- )) or format_html('<i>No Snapshots yet...</i>')
-
- def contents(self, obj):
- if obj.uri.startswith('file:///data/'):
- source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
- contents = ""
- try:
- contents = source_file.read_text().strip()[:14_000]
- except Exception as e:
- contents = f'Error reading {source_file}: {e}'
-
- return format_html('<b>{}:</b><br/><pre>{}</pre>', source_file, contents)
-
- return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
-
-
-@abx.hookimpl
-def register_admin(admin_site):
- admin_site.register(Seed, SeedAdmin)
diff --git a/archivebox/seeds/apps.py b/archivebox/seeds/apps.py
deleted file mode 100644
index 38eb4fde..00000000
--- a/archivebox/seeds/apps.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from django.apps import AppConfig
-
-
-class SeedsConfig(AppConfig):
- default_auto_field = "django.db.models.BigAutoField"
- name = "seeds"
diff --git a/archivebox/seeds/migrations/__init__.py b/archivebox/seeds/migrations/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/archivebox/seeds/models.py b/archivebox/seeds/models.py
deleted file mode 100644
index ce96c913..00000000
--- a/archivebox/seeds/models.py
+++ /dev/null
@@ -1,115 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-from typing import TYPE_CHECKING
-from pathlib import Path
-
-from django.db import models
-from django.db.models import QuerySet
-from django.conf import settings
-from django.urls import reverse_lazy
-
-from archivebox.config import CONSTANTS
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
-
-if TYPE_CHECKING:
- from crawls.models import Crawl, CrawlSchedule
- from core.models import Snapshot
-
-
-class Seed(ABIDModel, ModelWithHealthStats):
- """
- A fountain that produces URLs (+metadata) each time it's queried e.g.
- - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
- - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
- - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
- - https://getpocket.com/user/nikisweeting/feed
- - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
- - ...
- Each query of a Seed can produce the same list of URLs, or a different list each time.
- The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
-
- When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
- The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
- The outlinks then get turned into new pending Snapshots under the same crawl,
- and the cycle repeats until Crawl.max_depth.
-
- Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
- stateful remote services, files with contents that change, directories that have new files within, etc.
- """
-
- abid_prefix = 'src_'
- abid_ts_src = 'self.created_at'
- abid_uri_src = 'self.uri'
- abid_subtype_src = 'self.extractor'
- abid_rand_src = 'self.id'
- abid_drift_allowed = True
-
- id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
- abid = ABIDField(prefix=abid_prefix)
-
- uri = models.URLField(max_length=2000, blank=False, null=False) # unique source location where URLs will be loaded from
- label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
- notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
-
- extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
- tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
- config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
-
- created_at = AutoDateTimeField(default=None, null=False, db_index=True)
- modified_at = models.DateTimeField(auto_now=True)
- created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
-
-
- crawl_set: models.Manager['Crawl']
-
- class Meta:
- verbose_name = 'Seed'
- verbose_name_plural = 'Seeds'
-
- unique_together = (('created_by', 'uri', 'extractor'),)
-
-
- @classmethod
- def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
- source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
-
- seed, _ = cls.objects.get_or_create(
- label=label or source_file.name,
- uri=f'file://{source_path}',
- created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
- extractor=parser,
- tags_str=tag,
- config=config or {},
- )
- seed.save()
- return seed
-
- @property
- def source_type(self):
- # e.g. http/https://
- # file://
- # pocketapi://
- # s3://
- # etc..
- return self.uri.split('://', 1)[0].lower()
-
- @property
- def api_url(self) -> str:
- # /api/v1/core/seed/{uulid}
- return reverse_lazy('api-1:get_seed', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}'
-
- @property
- def api_docs_url(self) -> str:
- return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
-
- @property
- def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
- from crawls.models import CrawlSchedule
- return CrawlSchedule.objects.filter(template__seed_id=self.pk)
-
- @property
- def snapshot_set(self) -> QuerySet['Snapshot']:
- from core.models import Snapshot
-
- crawl_ids = self.crawl_set.values_list('pk', flat=True)
- return Snapshot.objects.filter(crawl_id__in=crawl_ids)
diff --git a/archivebox/seeds/tests.py b/archivebox/seeds/tests.py
deleted file mode 100644
index 7ce503c2..00000000
--- a/archivebox/seeds/tests.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
diff --git a/archivebox/seeds/views.py b/archivebox/seeds/views.py
deleted file mode 100644
index 91ea44a2..00000000
--- a/archivebox/seeds/views.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.