mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-03 06:17:53 +10:00
Merge branch 'dev' into method_allow_deny
This commit is contained in:
@@ -9,6 +9,7 @@ from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
+ chrome_cleanup,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
@@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
+ chrome_cleanup()
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
+ chrome_cleanup,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
@@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
+ chrome_cleanup()
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||
try:
|
||||
result_json = json.loads(result.stdout)
|
||||
- assert result_json and 'content' in result_json
|
||||
+ assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
|
||||
except json.JSONDecodeError:
|
||||
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
|
||||
|
||||
@@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
||||
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||
output_tail = [
|
||||
line.strip()
|
||||
- for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
|
||||
+ for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
|
||||
if line.strip()
|
||||
]
|
||||
hints = (
|
||||
|
||||
@@ -9,6 +9,7 @@ from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
+ chrome_cleanup,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
@@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
+ chrome_cleanup()
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ from ..logging_util import TimedProgress
|
||||
|
||||
HTML_TITLE_REGEX = re.compile(
|
||||
r'<title.*?>' # start matching text after <title> tag
|
||||
- r'(.[^<>]+)', # get everything up to these symbols
|
||||
+ r'([^<>]+)', # get everything up to these symbols
|
||||
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user