diff --git a/archivebox/api/tests.py b/archivebox/api/tests.py index f04fa248..785051b4 100644 --- a/archivebox/api/tests.py +++ b/archivebox/api/tests.py @@ -25,8 +25,8 @@ class CLIScheduleAPITests(TestCase): def test_schedule_api_creates_schedule(self): request = RequestFactory().post('/api/v1/cli/schedule') request.user = self.user - request.stdout = StringIO() - request.stderr = StringIO() + setattr(request, 'stdout', StringIO()) + setattr(request, 'stderr', StringIO()) args = ScheduleCommandSchema( every='daily', import_path='https://example.com/feed.xml', diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py index 39a7b974..1cae7231 100644 --- a/archivebox/api/v1_cli.py +++ b/archivebox/api/v1_cli.py @@ -112,7 +112,7 @@ class RemoveCommandSchema(Schema): def cli_add(request: HttpRequest, args: AddCommandSchema): from archivebox.cli.archivebox_add import add - result = add( + crawl, snapshots = add( urls=args.urls, tag=args.tag, depth=args.depth, @@ -125,9 +125,9 @@ def cli_add(request: HttpRequest, args: AddCommandSchema): created_by_id=request.user.pk, ) - snapshot_ids = [str(snapshot_id) for snapshot_id in result.values_list('id', flat=True)] + snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list('id', flat=True)] result_payload = { - "crawl_id": getattr(result, "crawl_id", None), + "crawl_id": str(crawl.id), "num_snapshots": len(snapshot_ids), "snapshot_ids": snapshot_ids, "queued_urls": args.urls, diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index aa0cba8f..062eba8b 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -427,7 +427,7 @@ def get_any(request: HttpRequest, id: str): try: response = getter(request, id) if isinstance(response, Model): - return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}") + return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}") except Exception: pass @@ -435,7 +435,7 @@ def get_any(request: HttpRequest, id: str): from archivebox.api.v1_crawls import get_crawl response = get_crawl(request, id) if isinstance(response, Model): - return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}") + return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}") except Exception: pass diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index ef345d8b..cdfc616c 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -21,6 +21,7 @@ from archivebox.config.permissions import USER, HOSTNAME if TYPE_CHECKING: from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl def _collect_input_urls(args: tuple[str, ...]) -> list[str]: @@ -53,7 +54,7 @@ def add(urls: str | list[str], update: bool=not ARCHIVING_CONFIG.ONLY_NEW, index_only: bool=False, bg: bool=False, - created_by_id: int | None=None) -> QuerySet['Snapshot']: + created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]: """Add a new URL or list of URLs to your archive. The flow is: @@ -145,7 +146,7 @@ def add(urls: str | list[str], if tag: snapshot.save_tags(tag.split(',')) snapshot.ensure_crawl_symlink() - return crawl.snapshot_set.all() + return crawl, crawl.snapshot_set.all() # 5. Start the orchestrator to process the queue # The orchestrator will: @@ -210,8 +211,7 @@ def add(urls: str | list[str], # 6. Return the list of Snapshots in this crawl snapshots = crawl.snapshot_set.all() - snapshots.crawl_id = str(crawl.id) - return snapshots + return crawl, snapshots @click.command() diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py index aaade4e8..5a2327dd 100644 --- a/archivebox/misc/jsonl.py +++ b/archivebox/misc/jsonl.py @@ -85,21 +85,21 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]: Yields parsed records as dicts. Supports both JSONL format and plain URLs (one per line). """ - stream = stream or sys.stdin + active_stream: TextIO = sys.stdin if stream is None else stream # Don't block if stdin is a tty with no input - if stream.isatty(): + if active_stream.isatty(): return try: - ready, _, _ = select.select([stream], [], [], 0) + ready, _, _ = select.select([active_stream], [], [], 0) except (OSError, ValueError): - ready = [stream] + ready = [active_stream] if not ready: return - for line in stream: + for line in active_stream: record = parse_line(line) if record: yield record @@ -142,9 +142,9 @@ def write_record(record: Dict[str, Any], stream: Optional[TextIO] = None) -> Non """ Write a single JSONL record to stdout (or provided stream). """ - stream = stream or sys.stdout - stream.write(json.dumps(record) + '\n') - stream.flush() + active_stream: TextIO = sys.stdout if stream is None else stream + active_stream.write(json.dumps(record) + '\n') + active_stream.flush() def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = None) -> int: diff --git a/archivebox/workers/tasks.py b/archivebox/workers/tasks.py index aec5c2a3..9eeb141b 100644 --- a/archivebox/workers/tasks.py +++ b/archivebox/workers/tasks.py @@ -27,7 +27,7 @@ def bg_add(add_kwargs: dict) -> int: add_kwargs = add_kwargs.copy() add_kwargs['bg'] = True - result = add(**add_kwargs) + _, result = add(**add_kwargs) return len(result) if result else 0