@staticmethod
def extract_domain_from_url(url: str) -> str:
    """
    Extract the domain portion of a URL for use as a filesystem path component.

    Uses the full hostname with path-unsafe characters sanitized: every ':'
    (from host:port, and from IPv6 literal hosts such as '::1') becomes '_'.

    Examples:
        https://example.com:8080 -> example.com_8080
        https://sub.example.com  -> sub.example.com
        file:///path             -> localhost
        data:text/html           -> data

    Returns 'unknown' for empty or unparseable URLs.
    """
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)

        if parsed.scheme in ('http', 'https'):
            # NOTE: reading parsed.port raises ValueError on a malformed
            # port string; the broad except below maps that to 'unknown'.
            if parsed.port:
                return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
            # Sanitize ':' here too — IPv6 hostnames (e.g. '::1') contain
            # colons even when no port is present.
            return (parsed.hostname or 'unknown').replace(':', '_')
        elif parsed.scheme == 'file':
            # Local files are grouped under a synthetic 'localhost' domain.
            return 'localhost'
        elif parsed.scheme:
            # Non-network schemes (data:, about:, ...) group by scheme name.
            return parsed.scheme
        else:
            # Scheme-less strings (e.g. '' or bare paths) have no domain.
            return 'unknown'
    except Exception:
        # Never let a malformed URL crash output-dir construction.
        return 'unknown'
@property
def output_dir_parent(self) -> str:
    """Construct parent directory: users/{username}/crawls/{YYYYMMDD}/{domain}"""
    date_str = self.created_at.strftime('%Y%m%d')
    # NOTE(review): created_by.username traverses the FK relation (a DB hit,
    # unlike the old created_by_id) — presumably acceptable here; confirm.
    username = self.created_by.username
    # Derive the domain bucket from the crawl's first URL. Call
    # get_urls_list() exactly once instead of twice: cheaper, and immune to
    # the list changing between the two calls of the original code.
    urls = self.get_urls_list()
    domain = self.extract_domain_from_url(urls[0]) if urls else 'unknown'
    return f'users/{username}/crawls/{date_str}/{domain}'