Add htmltotext extractor

Saves HTML text nodes and selected element attributes to `htmltotext.txt` for each Snapshot. The output is primarily intended for search indexing.
2026-01-04 09:55:33 +10:00 · 2023-10-23 21:42:25 -04:00
parent 6555719489
commit 310b4d1242
9 changed files with 203 additions and 104 deletions
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -143,7 +143,7 @@ def snapshot_icons(snapshot) -> str:
            "mercury": "🅼",
            "warc": "📦"
        }
-        exclude = ["favicon", "title", "headers", "archive_org"]
+        exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
        # Missing specific entry for WARC

        extractor_outputs = defaultdict(lambda: None)
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -429,6 +429,7 @@ class Link:
            'singlefile_path': 'singlefile.html',
            'readability_path': 'readability/content.html',
            'mercury_path': 'mercury/content.html',
+            'htmltotext_path': 'htmltotext.txt',
            'pdf_path': 'output.pdf',
            'screenshot_path': 'screenshot.png',
            'dom_path': 'output.html',
@@ -452,6 +453,7 @@ class Link:
                'singlefile_path': static_path,
                'readability_path': static_path,
                'mercury_path': static_path,
+                'htmltotext_path': static_path,
            })
        return canonical