Review output file paths and data directory structure (#1736)

- Update Crawl.output_dir_parent to use username instead of user_id for
consistency with Snapshot paths
- Add domain from first URL to Crawl path structure for easier
debugging: users/{username}/crawls/YYYYMMDD/{domain}/{crawl_id}/
- Add CRAWL_OUTPUT_DIR to config passed to Snapshot hooks so chrome_tab
can find the shared Chrome session from the Crawl
- Update comment in chrome_tab hook to reflect new config source

<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->

# Summary

<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->

# Related issues

<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
This commit is contained in:
Nick Sweeting
2025-12-31 00:19:03 -08:00
committed by GitHub
3 changed files with 41 additions and 3 deletions

View File

@@ -220,6 +220,10 @@ def get_config(
if crawl and hasattr(crawl, "config") and crawl.config:
config.update(crawl.config)
# Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
if crawl and hasattr(crawl, "OUTPUT_DIR"):
config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR)
# Apply snapshot config overrides (highest priority)
if snapshot and hasattr(snapshot, "config") and snapshot.config:
config.update(snapshot.config)

View File

@@ -250,11 +250,45 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
)
return crawl
@staticmethod
def extract_domain_from_url(url: str) -> str:
"""
Extract domain from URL for path structure.
Uses full hostname with sanitized special chars.
Examples:
https://example.com:8080 → example.com_8080
https://sub.example.com → sub.example.com
file:///path → localhost
data:text/html → data
"""
from urllib.parse import urlparse
try:
parsed = urlparse(url)
if parsed.scheme in ('http', 'https'):
if parsed.port:
return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
return parsed.hostname or 'unknown'
elif parsed.scheme == 'file':
return 'localhost'
elif parsed.scheme:
return parsed.scheme
else:
return 'unknown'
except Exception:
return 'unknown'
@property
def output_dir_parent(self) -> str:
    """Construct parent directory: users/{username}/crawls/{YYYYMMDD}/{domain}"""
    # NOTE(review): the previous revision of this block still contained the
    # pre-change docstring and an early `return f'users/{self.created_by_id}/...'`
    # (a stale diff artifact), which made the username/domain path code below
    # unreachable. Those stale lines are removed here.
    date_str = self.created_at.strftime('%Y%m%d')
    username = self.created_by.username
    # Domain of the first URL is included for easier debugging of the on-disk
    # layout; call get_urls_list() once instead of twice.
    urls = self.get_urls_list()
    domain = self.extract_domain_from_url(urls[0]) if urls else 'unknown'
    return f'users/{username}/crawls/{date_str}/{domain}'
@property
def output_dir_name(self) -> str:

View File

@@ -89,7 +89,7 @@ process.on('SIGINT', cleanup);
function findCrawlChromeSession(crawlId) {
if (!crawlId) return null;
// Use CRAWL_OUTPUT_DIR env var set by hooks.py
// Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
if (!crawlOutputDir) return null;