Fix output path structure for 0.9.x data directory

- Update Crawl.output_dir_parent to use username instead of user_id
  for consistency with Snapshot paths
- Add domain from first URL to Crawl path structure for easier debugging
  (see the example path below): users/{username}/crawls/YYYYMMDD/{domain}/{crawl_id}/
- Add CRAWL_OUTPUT_DIR to the config passed to Snapshot hooks so chrome_tab
  can find the shared Chrome session from the Crawl
- Update the comment in the chrome_tab hook to reflect the new config source
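For concreteness, a minimal sketch of how the new parent path is assembled; the username, date, and domain below are hypothetical values for illustration, not anything from this commit:

    # Hypothetical values, mirroring the pieces output_dir_parent combines
    username = 'nick'            # crawl.created_by.username
    date_str = '20240115'        # crawl.created_at.strftime('%Y%m%d')
    domain = 'example.com_8080'  # Crawl.extract_domain_from_url(first_url)
    print(f'users/{username}/crawls/{date_str}/{domain}')
    # -> users/nick/crawls/20240115/example.com_8080  (the {crawl_id} dir nests under this)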
get_config() in configset.py:

@@ -220,6 +220,10 @@ def get_config(
     if crawl and hasattr(crawl, "config") and crawl.config:
         config.update(crawl.config)
 
+    # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
+    if crawl and hasattr(crawl, "OUTPUT_DIR"):
+        config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR)
+
     # Apply snapshot config overrides (highest priority)
     if snapshot and hasattr(snapshot, "config") and snapshot.config:
         config.update(snapshot.config)
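For reference, a minimal sketch of how a snapshot hook might pick this value back up. That the config reaches hooks as environment variables mirrors the getEnv() call in the chrome_tab hook further down; the 'chrome' subdirectory name is purely an assumption here:

    import os
    from pathlib import Path

    def find_crawl_chrome_session() -> Path | None:
        # CRAWL_OUTPUT_DIR is only present when the snapshot belongs to a crawl
        crawl_output_dir = os.environ.get('CRAWL_OUTPUT_DIR', '')
        if not crawl_output_dir:
            return None  # standalone snapshot: no shared Chrome session to reuse
        session_dir = Path(crawl_output_dir) / 'chrome'  # hypothetical layout
        return session_dir if session_dir.is_dir() else None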
Crawl model:

@@ -250,11 +250,45 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         )
         return crawl
 
+    @staticmethod
+    def extract_domain_from_url(url: str) -> str:
+        """
+        Extract domain from URL for path structure.
+        Uses full hostname with sanitized special chars.
+
+        Examples:
+            https://example.com:8080 → example.com_8080
+            https://sub.example.com → sub.example.com
+            file:///path → localhost
+            data:text/html → data
+        """
+        from urllib.parse import urlparse
+
+        try:
+            parsed = urlparse(url)
+
+            if parsed.scheme in ('http', 'https'):
+                if parsed.port:
+                    return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
+                return parsed.hostname or 'unknown'
+            elif parsed.scheme == 'file':
+                return 'localhost'
+            elif parsed.scheme:
+                return parsed.scheme
+            else:
+                return 'unknown'
+        except Exception:
+            return 'unknown'
+
     @property
     def output_dir_parent(self) -> str:
-        """Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}"""
+        """Construct parent directory: users/{username}/crawls/{YYYYMMDD}/{domain}"""
         date_str = self.created_at.strftime('%Y%m%d')
-        return f'users/{self.created_by_id}/crawls/{date_str}'
+        username = self.created_by.username
+
+        # Get domain from first URL
+        first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
+        domain = self.extract_domain_from_url(first_url) if first_url else 'unknown'
+
+        return f'users/{username}/crawls/{date_str}/{domain}'
 
     @property
     def output_dir_name(self) -> str:
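Since extract_domain_from_url() is a staticmethod, the docstring examples can be exercised directly; the assertions below restate them, plus the scheme-less fallback:

    # Mirrors the docstring examples in the diff above
    assert Crawl.extract_domain_from_url('https://example.com:8080') == 'example.com_8080'
    assert Crawl.extract_domain_from_url('https://sub.example.com') == 'sub.example.com'
    assert Crawl.extract_domain_from_url('file:///path') == 'localhost'
    assert Crawl.extract_domain_from_url('data:text/html') == 'data'
    assert Crawl.extract_domain_from_url('no-scheme') == 'unknown'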
chrome_tab hook:

@@ -89,7 +89,7 @@ process.on('SIGINT', cleanup);
 function findCrawlChromeSession(crawlId) {
     if (!crawlId) return null;
 
-    // Use CRAWL_OUTPUT_DIR env var set by hooks.py
+    // Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
     const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
     if (!crawlOutputDir) return null;
 