Review output file paths and data directory structure (#1736)
- Update Crawl.output_dir_parent to use username instead of user_id, for consistency with Snapshot paths
- Add the domain from the first URL to the Crawl path structure for easier debugging: users/{username}/crawls/YYYYMMDD/{domain}/{crawl_id}/ (see the sketch below)
- Add CRAWL_OUTPUT_DIR to the config passed to Snapshot hooks so chrome_tab can find the shared Chrome session from the Crawl
- Update the comment in the chrome_tab hook to reflect the new config source
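For illustration, the layout in the second bullet produces paths like the following. A minimal sketch with made-up values; none of these concrete names come from the codebase:

```python
# Hypothetical illustration of the new Crawl output path layout.
# All concrete values (username, date, domain, id) are made-up examples.
from datetime import datetime

username = 'alice'                                   # crawl.created_by.username
date_str = datetime(2026, 1, 3).strftime('%Y%m%d')   # from crawl.created_at
domain = 'example.com'                               # from the crawl's first URL
crawl_id = '<crawl-uuid>'                            # placeholder, not a real id

print(f'users/{username}/crawls/{date_str}/{domain}/{crawl_id}/')
# -> users/alice/crawls/20260103/example.com/<crawl-uuid>/
```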
In configset.py, get_config() gains the new key:

```diff
@@ -220,6 +220,10 @@ def get_config(
     if crawl and hasattr(crawl, "config") and crawl.config:
         config.update(crawl.config)
 
+    # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
+    if crawl and hasattr(crawl, "OUTPUT_DIR"):
+        config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR)
+
     # Apply snapshot config overrides (highest priority)
     if snapshot and hasattr(snapshot, "config") and snapshot.config:
         config.update(snapshot.config)
```
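On the consumer side, a Python snapshot hook could read the same value that the chrome_tab hook reads further below. A minimal sketch, assuming config keys such as CRAWL_OUTPUT_DIR are exported to hooks as environment variables; the 'chrome' subdirectory name is an assumption for illustration:

```python
# Sketch of a hook-side lookup for the Crawl's shared Chrome session dir.
# Assumes CRAWL_OUTPUT_DIR is exported to the hook as an env var; the
# 'chrome' subdirectory name is a made-up example, not ArchiveBox's layout.
import os
from pathlib import Path

def find_crawl_chrome_session() -> Path | None:
    crawl_output_dir = os.environ.get('CRAWL_OUTPUT_DIR', '')
    if not crawl_output_dir:
        return None
    session_dir = Path(crawl_output_dir) / 'chrome'
    return session_dir if session_dir.is_dir() else None
```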
In the Crawl model, a new domain helper and the updated output_dir_parent:

```diff
@@ -250,11 +250,45 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         )
         return crawl
 
+    @staticmethod
+    def extract_domain_from_url(url: str) -> str:
+        """
+        Extract domain from URL for path structure.
+        Uses full hostname with sanitized special chars.
+
+        Examples:
+            https://example.com:8080 → example.com_8080
+            https://sub.example.com → sub.example.com
+            file:///path → localhost
+            data:text/html → data
+        """
+        from urllib.parse import urlparse
+
+        try:
+            parsed = urlparse(url)
+
+            if parsed.scheme in ('http', 'https'):
+                if parsed.port:
+                    return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
+                return parsed.hostname or 'unknown'
+            elif parsed.scheme == 'file':
+                return 'localhost'
+            elif parsed.scheme:
+                return parsed.scheme
+            else:
+                return 'unknown'
+        except Exception:
+            return 'unknown'
+
     @property
     def output_dir_parent(self) -> str:
-        """Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}"""
+        """Construct parent directory: users/{username}/crawls/{YYYYMMDD}/{domain}"""
         date_str = self.created_at.strftime('%Y%m%d')
-        return f'users/{self.created_by_id}/crawls/{date_str}'
+        username = self.created_by.username
+        # Get domain from first URL
+        first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
+        domain = self.extract_domain_from_url(first_url) if first_url else 'unknown'
+        return f'users/{username}/crawls/{date_str}/{domain}'
 
     @property
     def output_dir_name(self) -> str:
```
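The docstring examples above double as quick spot checks. A doctest-style sketch; the import path is an assumption, since the diff does not name the module:

```python
# Spot-check Crawl.extract_domain_from_url against its docstring examples.
# The module path below is an assumption for illustration.
from archivebox.crawls.models import Crawl

assert Crawl.extract_domain_from_url('https://example.com:8080') == 'example.com_8080'
assert Crawl.extract_domain_from_url('https://sub.example.com') == 'sub.example.com'
assert Crawl.extract_domain_from_url('file:///path') == 'localhost'
assert Crawl.extract_domain_from_url('data:text/html') == 'data'
assert Crawl.extract_domain_from_url('not a url') == 'unknown'  # no scheme at all
```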
In the chrome_tab hook, only the comment changes:

```diff
@@ -89,7 +89,7 @@ process.on('SIGINT', cleanup);
 function findCrawlChromeSession(crawlId) {
     if (!crawlId) return null;
 
-    // Use CRAWL_OUTPUT_DIR env var set by hooks.py
+    // Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
     const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
     if (!crawlOutputDir) return null;
 
```
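For the getEnv() call above to see CRAWL_OUTPUT_DIR, the Python side has to place it in the hook's environment. A hedged sketch of one way a hook runner could do that; this is not ArchiveBox's actual runner:

```python
# Hypothetical hook runner: merge get_config() output into the child env so
# getEnv('CRAWL_OUTPUT_DIR') resolves inside the Node hook. Illustrative only.
import os
import subprocess

def run_node_hook(hook_path: str, config: dict) -> subprocess.CompletedProcess:
    env = {**os.environ, **{k: str(v) for k, v in config.items()}}
    return subprocess.run(['node', hook_path], env=env,
                          capture_output=True, text=True)
```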