""" Tests for archivebox crawl CLI command. Tests cover: - crawl create (with URLs, from stdin, pass-through) - crawl list (with filters) - crawl update - crawl delete """ import json from archivebox.tests.conftest import ( run_archivebox_cmd, parse_jsonl_output, create_test_url, ) class TestCrawlCreate: """Tests for `archivebox crawl create`.""" def test_create_from_url_args(self, initialized_archive): """Create crawl from URL arguments.""" url = create_test_url() stdout, stderr, code = run_archivebox_cmd( ["crawl", "create", url], data_dir=initialized_archive, ) assert code == 0, f"Command failed: {stderr}" assert "Created crawl" in stderr # Check JSONL output records = parse_jsonl_output(stdout) assert len(records) == 1 assert records[0]["type"] == "Crawl" assert url in records[0]["urls"] def test_create_from_stdin_urls(self, initialized_archive): """Create crawl from stdin URLs (one per line).""" urls = [create_test_url() for _ in range(3)] stdin = "\n".join(urls) stdout, stderr, code = run_archivebox_cmd( ["crawl", "create"], stdin=stdin, data_dir=initialized_archive, ) assert code == 0, f"Command failed: {stderr}" records = parse_jsonl_output(stdout) assert len(records) == 1 crawl = records[0] assert crawl["type"] == "Crawl" # All URLs should be in the crawl for url in urls: assert url in crawl["urls"] def test_create_with_depth(self, initialized_archive): """Create crawl with --depth flag.""" url = create_test_url() stdout, stderr, code = run_archivebox_cmd( ["crawl", "create", "--depth=2", url], data_dir=initialized_archive, ) assert code == 0 records = parse_jsonl_output(stdout) assert records[0]["max_depth"] == 2 def test_create_with_tag(self, initialized_archive): """Create crawl with --tag flag.""" url = create_test_url() stdout, stderr, code = run_archivebox_cmd( ["crawl", "create", "--tag=test-tag", url], data_dir=initialized_archive, ) assert code == 0 records = parse_jsonl_output(stdout) assert "test-tag" in records[0].get("tags_str", "") def test_create_pass_through_other_types(self, initialized_archive): """Pass-through records of other types unchanged.""" tag_record = {"type": "Tag", "id": "fake-tag-id", "name": "test"} url = create_test_url() stdin = json.dumps(tag_record) + "\n" + json.dumps({"url": url}) stdout, stderr, code = run_archivebox_cmd( ["crawl", "create"], stdin=stdin, data_dir=initialized_archive, ) assert code == 0 records = parse_jsonl_output(stdout) # Should have both the passed-through Tag and the new Crawl types = [r.get("type") for r in records] assert "Tag" in types assert "Crawl" in types def test_create_pass_through_existing_crawl(self, initialized_archive): """Existing Crawl records (with id) are passed through.""" # First create a crawl url = create_test_url() stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) crawl = parse_jsonl_output(stdout1)[0] # Now pipe it back - should pass through stdout2, stderr, code = run_archivebox_cmd( ["crawl", "create"], stdin=json.dumps(crawl), data_dir=initialized_archive, ) assert code == 0 records = parse_jsonl_output(stdout2) assert len(records) == 1 assert records[0]["id"] == crawl["id"] class TestCrawlList: """Tests for `archivebox crawl list`.""" def test_list_empty(self, initialized_archive): """List with no crawls returns empty.""" stdout, stderr, code = run_archivebox_cmd( ["crawl", "list"], data_dir=initialized_archive, ) assert code == 0 assert "Listed 0 crawls" in stderr def test_list_returns_created(self, initialized_archive): """List returns previously created crawls.""" url = create_test_url() run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) stdout, stderr, code = run_archivebox_cmd( ["crawl", "list"], data_dir=initialized_archive, ) assert code == 0 records = parse_jsonl_output(stdout) assert len(records) >= 1 assert any(url in r.get("urls", "") for r in records) def test_list_filter_by_status(self, initialized_archive): """Filter crawls by status.""" url = create_test_url() run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) stdout, stderr, code = run_archivebox_cmd( ["crawl", "list", "--status=queued"], data_dir=initialized_archive, ) assert code == 0 records = parse_jsonl_output(stdout) for r in records: assert r["status"] == "queued" def test_list_with_limit(self, initialized_archive): """Limit number of results.""" # Create multiple crawls for _ in range(3): run_archivebox_cmd(["crawl", "create", create_test_url()], data_dir=initialized_archive) stdout, stderr, code = run_archivebox_cmd( ["crawl", "list", "--limit=2"], data_dir=initialized_archive, ) assert code == 0 records = parse_jsonl_output(stdout) assert len(records) == 2 class TestCrawlUpdate: """Tests for `archivebox crawl update`.""" def test_update_status(self, initialized_archive): """Update crawl status.""" # Create a crawl url = create_test_url() stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) crawl = parse_jsonl_output(stdout1)[0] # Update it stdout2, stderr, code = run_archivebox_cmd( ["crawl", "update", "--status=started"], stdin=json.dumps(crawl), data_dir=initialized_archive, ) assert code == 0 assert "Updated 1 crawls" in stderr records = parse_jsonl_output(stdout2) assert records[0]["status"] == "started" class TestCrawlDelete: """Tests for `archivebox crawl delete`.""" def test_delete_requires_yes(self, initialized_archive): """Delete requires --yes flag.""" url = create_test_url() stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) crawl = parse_jsonl_output(stdout1)[0] stdout, stderr, code = run_archivebox_cmd( ["crawl", "delete"], stdin=json.dumps(crawl), data_dir=initialized_archive, ) assert code == 1 assert "--yes" in stderr def test_delete_with_yes(self, initialized_archive): """Delete with --yes flag works.""" url = create_test_url() stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) crawl = parse_jsonl_output(stdout1)[0] stdout, stderr, code = run_archivebox_cmd( ["crawl", "delete", "--yes"], stdin=json.dumps(crawl), data_dir=initialized_archive, ) assert code == 0 assert "Deleted 1 crawls" in stderr def test_delete_dry_run(self, initialized_archive): """Dry run shows what would be deleted.""" url = create_test_url() stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) crawl = parse_jsonl_output(stdout1)[0] stdout, stderr, code = run_archivebox_cmd( ["crawl", "delete", "--dry-run"], stdin=json.dumps(crawl), data_dir=initialized_archive, ) assert code == 0 assert "Would delete" in stderr assert "dry run" in stderr.lower()