mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip
This commit is contained in:
@@ -117,20 +117,28 @@ def main(url: str, snapshot_id: str = None):
|
||||
if cleaned_url != url:
|
||||
urls_found.add(cleaned_url)
|
||||
|
||||
if not urls_found:
|
||||
click.echo('No URLs found', err=True)
|
||||
sys.exit(1)
|
||||
# Emit Snapshot records to stdout (JSONL)
|
||||
for found_url in sorted(urls_found):
|
||||
record = {
|
||||
'type': 'Snapshot',
|
||||
'url': found_url,
|
||||
'plugin': PLUGIN_NAME,
|
||||
}
|
||||
if snapshot_id:
|
||||
record['parent_snapshot_id'] = snapshot_id
|
||||
print(json.dumps(record))
|
||||
|
||||
# Write urls.jsonl
|
||||
with open('urls.jsonl', 'w') as f:
|
||||
for found_url in sorted(urls_found):
|
||||
f.write(json.dumps({
|
||||
'type': 'Snapshot',
|
||||
'url': found_url,
|
||||
'plugin': PLUGIN_NAME,
|
||||
}) + '\n')
|
||||
# Emit ArchiveResult record to mark completion
|
||||
status = 'succeeded' if urls_found else 'skipped'
|
||||
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
|
||||
ar_record = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
'output_str': output_str,
|
||||
}
|
||||
print(json.dumps(ar_record))
|
||||
|
||||
click.echo(f'Found {len(urls_found)} URLs')
|
||||
click.echo(output_str, err=True)
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
|
||||
@@ -32,17 +32,16 @@ https://www.iana.org/domains/reserved
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Failed: {result.stderr}"
|
||||
assert 'Found 3 URLs' in result.stdout
|
||||
assert 'Found 3 URLs' in result.stderr
|
||||
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
assert output_file.exists()
|
||||
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
# Parse Snapshot records from stdout
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
|
||||
assert len(lines) == 3
|
||||
|
||||
urls = set()
|
||||
for line in lines:
|
||||
entry = json.loads(line)
|
||||
assert entry['type'] == 'Snapshot'
|
||||
assert 'url' in entry
|
||||
urls.add(entry['url'])
|
||||
|
||||
@@ -51,6 +50,10 @@ https://www.iana.org/domains/reserved
|
||||
assert 'https://example.com/page' in urls
|
||||
assert 'https://www.iana.org/domains/reserved' in urls
|
||||
|
||||
# Verify ArchiveResult record
|
||||
assert '"type": "ArchiveResult"' in result.stdout
|
||||
assert '"status": "succeeded"' in result.stdout
|
||||
|
||||
def test_extracts_urls_from_mixed_content(self, tmp_path):
|
||||
"""Test extracting URLs embedded in prose text."""
|
||||
input_file = tmp_path / 'mixed.txt'
|
||||
@@ -68,8 +71,7 @@ Also see https://github.com/user/repo for the code.
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
urls = {json.loads(line)['url'] for line in lines}
|
||||
|
||||
assert 'https://blog.example.com/post' in urls
|
||||
@@ -92,15 +94,14 @@ Also see https://github.com/user/repo for the code.
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
urls = {json.loads(line)['url'] for line in lines}
|
||||
|
||||
assert 'https://example.com/page' in urls
|
||||
assert any('wikipedia.org' in u for u in urls)
|
||||
|
||||
def test_exits_1_when_no_urls_found(self, tmp_path):
|
||||
"""Test that script exits with code 1 when no URLs found."""
|
||||
def test_skips_when_no_urls_found(self, tmp_path):
|
||||
"""Test that script returns skipped status when no URLs found."""
|
||||
input_file = tmp_path / 'empty.txt'
|
||||
input_file.write_text('no urls here, just plain text')
|
||||
|
||||
@@ -111,8 +112,9 @@ Also see https://github.com/user/repo for the code.
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 1
|
||||
assert result.returncode == 0
|
||||
assert 'No URLs found' in result.stderr
|
||||
assert '"status": "skipped"' in result.stdout
|
||||
|
||||
def test_exits_1_when_file_not_found(self, tmp_path):
|
||||
"""Test that script exits with code 1 when file doesn't exist."""
|
||||
@@ -144,12 +146,11 @@ https://other.com
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
assert len(lines) == 2
|
||||
|
||||
def test_appends_to_existing_file(self, tmp_path):
|
||||
"""Test that output creates urls.jsonl with extracted URLs."""
|
||||
def test_outputs_to_stdout(self, tmp_path):
|
||||
"""Test that output goes to stdout in JSONL format."""
|
||||
input_file = tmp_path / 'urls.txt'
|
||||
input_file.write_text('https://new.com\nhttps://other.com')
|
||||
|
||||
@@ -161,8 +162,7 @@ https://other.com
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
assert len(lines) == 2
|
||||
|
||||
urls = {json.loads(line)['url'] for line in lines}
|
||||
@@ -182,11 +182,11 @@ https://other.com
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
entry = json.loads(output_file.read_text().strip())
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
entry = json.loads(lines[0])
|
||||
assert entry['url'] == 'https://example.com'
|
||||
assert 'type' in entry
|
||||
assert 'plugin' in entry
|
||||
assert entry['type'] == 'Snapshot'
|
||||
assert entry['plugin'] == 'parse_txt_urls'
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Reference in New Issue
Block a user