mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-04 09:55:33 +10:00
Add htmltotext extractor
Saves HTML text nodes and selected element attributes in `htmltotext.txt` for each Snapshot. Primarily intended to be used for search indexing.
This commit is contained in:
@@ -143,7 +143,7 @@ def snapshot_icons(snapshot) -> str:
|
||||
"mercury": "🅼",
|
||||
"warc": "📦"
|
||||
}
|
||||
- exclude = ["favicon", "title", "headers", "archive_org"]
|
||||
+ exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
|
||||
# Missing specific entry for WARC
|
||||
|
||||
extractor_outputs = defaultdict(lambda: None)
|
||||
|
||||
@@ -429,6 +429,7 @@ class Link:
|
||||
'singlefile_path': 'singlefile.html',
|
||||
'readability_path': 'readability/content.html',
|
||||
'mercury_path': 'mercury/content.html',
|
||||
+ 'htmltotext_path': 'htmltotext.txt',
|
||||
'pdf_path': 'output.pdf',
|
||||
'screenshot_path': 'screenshot.png',
|
||||
'dom_path': 'output.html',
|
||||
@@ -452,6 +453,7 @@ class Link:
|
||||
'singlefile_path': static_path,
|
||||
'readability_path': static_path,
|
||||
'mercury_path': static_path,
|
||||
+ 'htmltotext_path': static_path,
|
||||
})
|
||||
return canonical
|
||||
|
||||
|
||||
Reference in New Issue
Block a user