import json
import re
import subprocess

from .fixtures import *


def parse_search_json(output_str):
    """Parse JSON from `archivebox search` output, falling back to stripping
    ANSI escape sequences and control characters if the raw output fails to parse."""
    try:
        return json.loads(output_str)
    except json.JSONDecodeError:
        # Remove ANSI escape sequences, replace tab/newline/CR with spaces,
        # drop any other control characters, then retry
        clean_str = re.sub(r'\x1b\[[0-9;]*m', '', output_str)
        clean_str = re.sub(r'[\x00-\x1f\x7f]', lambda m: ' ' if m.group(0) in '\t\n\r' else '', clean_str)
        return json.loads(clean_str)


def test_search_json(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--json"], capture_output=True)
    output_json = parse_search_json(search_process.stdout.decode("utf-8").strip())
    # With --index-only, only source file snapshots are created (file:// URLs),
    # so we should get at least one snapshot back
    assert len(output_json) >= 1
    # The snapshot should be a file:// URL pointing to sources
    assert any("sources" in entry.get("url", "") for entry in output_json)


def test_search_json_headers(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--json", "--with-headers"], capture_output=True)
    output_json = parse_search_json(search_process.stdout.decode("utf-8").strip())
    # With --with-headers, the response should be a dict with a "links" key
    links = output_json.get("links", output_json) if isinstance(output_json, dict) else output_json
    assert len(links) >= 1


def test_search_html(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--html"], capture_output=True)
    output_html = search_process.stdout.decode("utf-8")
    # Should contain some HTML and a reference to the source file
    assert "sources" in output_html or "cli_add" in output_html or "<" in output_html


def test_search_html_headers(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--html", "--with-headers"], capture_output=True)
    output_html = search_process.stdout.decode("utf-8")
    # Should contain HTML
    assert "<" in output_html


def test_search_csv(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--csv", "url"], capture_output=True)
    output_csv = search_process.stdout.decode("utf-8")
    # Should contain the source file URL
    assert "file://" in output_csv or "sources" in output_csv


def test_search_csv_headers(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--csv", "url", "--with-headers"], capture_output=True)
    output_csv = search_process.stdout.decode("utf-8")
    # Should have a "url" header row along with the source file content
    assert "url" in output_csv


def test_search_with_headers_requires_format(process):
    search_process = subprocess.run(["archivebox", "search", "--with-headers"], capture_output=True)
    stderr = search_process.stderr.decode("utf-8")
    assert "--with-headers" in stderr and ("requires" in stderr or "can only be used" in stderr)


def test_sort_by_url(process, disable_extractors_dict):
    # Add two URLs - they will create separate source files
    subprocess.run(["archivebox", "add", "--index-only", "https://iana.org", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    # Search with --sort should still return all results (even if they're file:// URLs)
    search_process = subprocess.run(["archivebox", "search", "--csv", "url", "--sort=url"], capture_output=True)
    output = search_process.stdout.decode("utf-8")
    lines = [line for line in output.strip().split("\n") if line]
    # Should have at least 2 snapshots (the source file snapshots)
    assert len(lines) >= 2
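

# The `process` and `disable_extractors_dict` fixtures come from tests/fixtures.py
# via the star import above. A minimal sketch of what they are assumed to provide
# (fixture bodies and config variable names here are illustrative, not the repo's
# exact definitions):
#
#   @pytest.fixture
#   def process(tmp_path, monkeypatch):
#       """Fresh, initialized ArchiveBox data directory for each test."""
#       monkeypatch.chdir(tmp_path)
#       return subprocess.run(["archivebox", "init"], capture_output=True)
#
#   @pytest.fixture
#   def disable_extractors_dict():
#       """Copy of os.environ with the archiving extractors turned off, so
#       `archivebox add` only writes index entries."""
#       env = os.environ.copy()
#       env.update({"USE_WGET": "false", "SAVE_PDF": "false", "SAVE_SCREENSHOT": "false"})
#       return env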