mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script
This commit is contained in:
@@ -23,7 +23,7 @@ const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'parse_dom_outlinks';
|
||||
const PLUGIN_NAME = 'parse_dom_outlinks';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'outlinks.json';
|
||||
const URLS_FILE = 'urls.jsonl'; // For crawl system
|
||||
@@ -190,7 +190,7 @@ async function extractOutlinks(url) {
|
||||
const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
|
||||
type: 'Snapshot',
|
||||
url: href,
|
||||
via_extractor: EXTRACTOR_NAME,
|
||||
plugin: PLUGIN_NAME,
|
||||
})).join('\n');
|
||||
|
||||
if (urlsJsonl) {
|
||||
@@ -236,10 +236,14 @@ async function main() {
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
// Check if Chrome session exists, then wait for page load
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await extractOutlinks(url);
|
||||
|
||||
Reference in New Issue
Block a user