mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
163 lines
5.5 KiB
Python
163 lines
5.5 KiB
Python
# Generated by Django 3.0.8 on 2020-11-04 12:25
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from django.db import migrations, models
|
|
import django.db.models.deletion
|
|
|
|
# Handle old vs new import paths
# ARCHIVE_DIR resolution order: CONSTANTS.ARCHIVE_DIR, then CONFIG["ARCHIVE_DIR"],
# then a relative "./archive" path when neither name can be imported.
try:
    from archivebox.config import CONSTANTS

    ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
except ImportError:
    try:
        from archivebox.config import CONFIG

        ARCHIVE_DIR = Path(CONFIG.get("ARCHIVE_DIR", "./archive"))
    except ImportError:
        ARCHIVE_DIR = Path("./archive")

# to_json resolution order: archivebox.misc.util, then index.json, then a local
# fallback built on json.dumps.
try:
    from archivebox.misc.util import to_json
except ImportError:
    try:
        from index.json import to_json
    except ImportError:
        # Last resort: 4-space indented JSON; values json cannot encode are passed through str().
        to_json = lambda x: json.dumps(x, indent=4, default=str)

# JSONField: use the field exposed by django.db.models when that attribute exists,
# otherwise fall back to the third-party `jsonfield` package.
try:
    JSONField = models.JSONField
except AttributeError:
    import jsonfield

    JSONField = jsonfield.JSONField
|
|
|
|
|
|
def forwards_func(apps, schema_editor):
    """Populate ArchiveResult rows from each snapshot's on-disk ``index.json``.

    For every Snapshot, ``ARCHIVE_DIR/<snapshot.timestamp>/index.json`` is read
    and one ArchiveResult is created per entry found under its ``"history"``
    mapping (extractor name -> list of result dicts).

    The import is best-effort: a snapshot whose index file cannot be read or
    parsed, or whose ``"history"`` is missing or not a mapping, is skipped, and
    an individual result that cannot be created is reported on stdout and
    skipped. None of these cases aborts the migration.

    Args:
        apps: Historical app registry supplied by ``migrations.RunPython``.
        schema_editor: Supplied by ``migrations.RunPython``; unused.
    """
    Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")

    for snapshot in Snapshot.objects.all():
        out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp

        try:
            with open(out_dir / "index.json") as f:
                fs_index = json.load(f)
        except Exception:
            # Deliberately broad: a missing or corrupt index simply means there
            # is nothing to import for this snapshot.
            continue

        # A parsed index without a usable "history" mapping is treated like a
        # missing index instead of raising and aborting the whole migration.
        history = fs_index.get("history") if isinstance(fs_index, dict) else None
        if not isinstance(history, dict):
            continue

        for extractor, results in history.items():
            if not isinstance(results, list):
                # e.g. null instead of a list of results: nothing to import.
                continue

            for result in results:
                try:
                    ArchiveResult.objects.create(
                        extractor=extractor,
                        snapshot=snapshot,
                        pwd=result["pwd"],
                        cmd=result.get("cmd") or [],
                        cmd_version=result.get("cmd_version") or "unknown",
                        start_ts=result["start_ts"],
                        end_ts=result["end_ts"],
                        status=result["status"],
                        output=result.get("output") or "null",
                    )
                except Exception as e:
                    # Keep going: one bad entry must not block the remaining ones.
                    print(
                        "    ! Skipping import due to missing/invalid index.json:",
                        out_dir,
                        e,
                        "(open an issue with this index.json for help)",
                    )
|
|
|
|
|
|
def verify_json_index_integrity(snapshot):
    """Write database results that are absent from ``index.json`` back into it.

    Reads ``ARCHIVE_DIR/<snapshot.timestamp>/index.json``, compares the
    ``start_ts`` of every entry under ``"history"`` with the ISO-formatted
    ``start_ts`` of each result in ``snapshot.archiveresult_set``, appends a
    history entry for every database result not found, and rewrites the file
    using ``to_json``.

    Args:
        snapshot: Object exposing ``timestamp`` and ``archiveresult_set.all()``;
            each result must expose ``extractor``, ``cmd``, ``cmd_version``,
            ``start_ts``/``end_ts`` (with ``isoformat()``), ``pwd``, ``output``
            and ``status``.

    Raises:
        Whatever ``open`` / ``json.load`` raise when the index file is missing
        or not valid JSON; those errors are not handled here.
    """
    index_path = Path(ARCHIVE_DIR) / snapshot.timestamp / "index.json"
    with open(index_path) as f:
        index = json.load(f)

    # Create the mapping when the index has none, so results can still be exported.
    history = index.setdefault("history", {})

    # Set membership keeps the comparison linear in the number of results.
    known_start_ts = {
        entry["start_ts"]
        for entries in history.values()
        for entry in (entries or ())
        if isinstance(entry, dict) and "start_ts" in entry
    }

    for result in snapshot.archiveresult_set.all():
        start_ts = result.start_ts.isoformat()
        if start_ts in known_start_ts:
            continue

        # setdefault: the database may hold results for an extractor that has
        # no key in the JSON history yet; plain indexing would raise KeyError.
        history.setdefault(result.extractor, []).append(
            {
                "cmd": result.cmd,
                "cmd_version": result.cmd_version,
                "end_ts": result.end_ts.isoformat(),
                "start_ts": start_ts,
                "pwd": result.pwd,
                "output": result.output,
                "schema": "ArchiveResult",
                "status": result.status,
            },
        )

    json_index = to_json(index)
    with open(index_path, "w") as f:
        f.write(json_index)
|
|
|
|
|
|
def reverse_func(apps, schema_editor):
    """Reverse the data import.

    Every snapshot is first passed to ``verify_json_index_integrity`` so that
    database results are written back to its index.json; only after all
    snapshots have been processed are the ArchiveResult rows deleted.

    Args:
        apps: Historical app registry supplied by ``migrations.RunPython``.
        schema_editor: Supplied by ``migrations.RunPython``; unused.
    """
    snapshot_model = apps.get_model("core", "Snapshot")
    archiveresult_model = apps.get_model("core", "ArchiveResult")

    # Export before deleting: an exception here leaves the rows in place.
    every_snapshot = snapshot_model.objects.all()
    for snap in every_snapshot:
        verify_json_index_integrity(snap)

    every_result = archiveresult_model.objects.all()
    every_result.delete()
|
|
|
|
|
|
class Migration(migrations.Migration):
    """Create the ``ArchiveResult`` model, then fill it via ``forwards_func``.

    Reversing runs ``reverse_func`` for the data step.
    """

    dependencies = [("core", "0006_auto_20201012_1520")]

    operations = [
        # Schema step: the table that the data step below populates.
        migrations.CreateModel(
            name="ArchiveResult",
            fields=[
                ("id", models.AutoField(verbose_name="ID", primary_key=True, auto_created=True, serialize=False)),
                ("cmd", JSONField()),
                ("pwd", models.CharField(max_length=256)),
                ("cmd_version", models.CharField(max_length=32)),
                (
                    "status",
                    models.CharField(
                        max_length=16,
                        # Each choice uses the same string as stored value and label.
                        choices=[(state, state) for state in ("succeeded", "failed", "skipped")],
                    ),
                ),
                ("output", models.CharField(max_length=512)),
                ("start_ts", models.DateTimeField()),
                ("end_ts", models.DateTimeField()),
                (
                    "extractor",
                    models.CharField(
                        max_length=32,
                        # Each choice uses the same string as stored value and label.
                        choices=[
                            (extractor_name, extractor_name)
                            for extractor_name in (
                                "title",
                                "favicon",
                                "wget",
                                "singlefile",
                                "pdf",
                                "screenshot",
                                "dom",
                                "readability",
                                "mercury",
                                "git",
                                "media",
                                "headers",
                                "archivedotorg",
                            )
                        ],
                    ),
                ),
                ("snapshot", models.ForeignKey(to="core.Snapshot", on_delete=django.db.models.deletion.CASCADE)),
            ],
        ),
        # Data step: forwards imports from index.json files, backwards exports and deletes.
        migrations.RunPython(forwards_func, reverse_func),
    ]
|