From 4fd7fcdbcfcee73980522ce7ac284e84adab218b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 26 Dec 2025 11:55:03 -0800 Subject: [PATCH] new gallerydl plugin and more --- .claude/settings.local.json | 5 +- PLUGIN_ENHANCEMENTS.md | 300 ---- SIMPLIFICATION_PLAN.md | 819 ---------- STORAGE_CAS_PLAN.md | 1341 +++++++++++++++++ TEST_RESULTS.md | 127 -- archivebox/Architecture.md | 2 +- archivebox/ArchiveBox.conf | 3 - .../BACKGROUND_HOOKS_IMPLEMENTATION_PLAN.md | 1152 ++++++++++++++ archivebox/core/admin_archiveresults.py | 22 +- archivebox/core/models.py | 145 +- archivebox/core/views.py | 63 +- archivebox/hooks.py | 95 +- archivebox/mcp/TEST_RESULTS.md | 181 +++ archivebox/misc/logging_util.py | 20 +- .../consolelog/on_Snapshot__21_consolelog.js | 51 +- archivebox/plugins/gallerydl/config.json | 45 + .../on_Crawl__00_validate_gallerydl.py | 129 ++ .../gallerydl/on_Snapshot__52_gallerydl.py | 299 ++++ .../responses/on_Snapshot__24_responses.js | 76 +- archivebox/plugins/ssl/on_Snapshot__23_ssl.js | 55 +- 20 files changed, 3495 insertions(+), 1435 deletions(-) delete mode 100644 PLUGIN_ENHANCEMENTS.md delete mode 100644 SIMPLIFICATION_PLAN.md create mode 100644 STORAGE_CAS_PLAN.md delete mode 100644 TEST_RESULTS.md delete mode 100644 archivebox/ArchiveBox.conf create mode 100644 archivebox/BACKGROUND_HOOKS_IMPLEMENTATION_PLAN.md create mode 100644 archivebox/mcp/TEST_RESULTS.md create mode 100644 archivebox/plugins/gallerydl/config.json create mode 100755 archivebox/plugins/gallerydl/on_Crawl__00_validate_gallerydl.py create mode 100755 archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json index ac196f40..70293cbd 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -6,7 +6,10 @@ "Bash(xargs:*)", "Bash(python -c:*)", "Bash(printf:*)", - "Bash(pkill:*)" + "Bash(pkill:*)", + "Bash(python3:*)", + "Bash(sqlite3:*)", + "WebFetch(domain:github.com)" ] } } diff --git 
a/PLUGIN_ENHANCEMENTS.md b/PLUGIN_ENHANCEMENTS.md deleted file mode 100644 index ade53064..00000000 --- a/PLUGIN_ENHANCEMENTS.md +++ /dev/null @@ -1,300 +0,0 @@ -# JS Implementation Features to Port to Python ArchiveBox - -## Priority: High Impact Features - -### 1. **Screen Recording** ⭐⭐⭐ -**JS Implementation:** Captures MP4 video + animated GIF of the archiving session -```javascript -// Records browser activity including scrolling, interactions -PuppeteerScreenRecorder → screenrecording.mp4 -ffmpeg conversion → screenrecording.gif (first 10s, optimized) -``` - -**Enhancement for Python:** -- Add `on_Snapshot__24_screenrecording.py` -- Use puppeteer or playwright screen recording APIs -- Generate both full MP4 and thumbnail GIF -- **Value:** Visual proof of what was captured, useful for QA and debugging - -### 2. **AI Quality Assurance** ⭐⭐⭐ -**JS Implementation:** Uses GPT-4o to analyze screenshots and validate archive quality -```javascript -// ai_qa.py analyzes screenshot.png and returns: -{ - "pct_visible": 85, - "warnings": ["Some content may be cut off"], - "main_content_title": "Article Title", - "main_content_author": "Author Name", - "main_content_date": "2024-01-15", - "website_brand_name": "Example.com" -} -``` - -**Enhancement for Python:** -- Add `on_Snapshot__95_aiqa.py` (runs after screenshot) -- Integrate with OpenAI API or local vision models -- Validates: content visibility, broken layouts, CAPTCHA blocks, error pages -- **Value:** Automatic detection of failed archives, quality scoring - -### 3. **Network Response Archiving** ⭐⭐⭐ -**JS Implementation:** Saves ALL network responses in organized structure -``` -responses/ -├── all/ # Timestamped unique files -│ ├── 20240101120000__GET__https%3A%2F%2Fexample.com%2Fapi.json -│ └── ... -├── script/ # Organized by resource type -│ └── example.com/path/to/script.js → ../all/... 
-├── stylesheet/ -├── image/ -├── media/ -└── index.jsonl # Searchable index -``` - -**Enhancement for Python:** -- Add `on_Snapshot__23_responses.py` -- Save all HTTP responses (XHR, images, scripts, etc.) -- Create both timestamped and URL-organized views via symlinks -- Generate `index.jsonl` with metadata (URL, method, status, mimeType, sha256) -- **Value:** Complete HTTP-level archive, better debugging, API response preservation - -### 4. **Detailed Metadata Extractors** ⭐⭐ - -#### 4a. SSL/TLS Details (`on_Snapshot__16_ssl.py`) -```python -{ - "protocol": "TLS 1.3", - "cipher": "AES_128_GCM", - "securityState": "secure", - "securityDetails": { - "issuer": "Let's Encrypt", - "validFrom": ..., - "validTo": ... - } -} -``` - -#### 4b. SEO Metadata (`on_Snapshot__17_seo.py`) -Extracts all `<meta>` tags: -```python -{ - "og:title": "Page Title", - "og:image": "https://example.com/image.jpg", - "twitter:card": "summary_large_image", - "description": "Page description", - ... -} -``` - -#### 4c. Accessibility Tree (`on_Snapshot__18_accessibility.py`) -```python -{ - "headings": ["# Main Title", "## Section 1", ...], - "iframes": ["https://embed.example.com/..."], - "tree": { ... } # Full accessibility snapshot -} -``` - -#### 4d. Outlinks Categorization (`on_Snapshot__19_outlinks.py`) -Better than current implementation - categorizes by type: -```python -{ - "hrefs": [...], # All links - "images": [...], # - "css_stylesheets": [...], # - "js_scripts": [...], #