fix: Improve headers handling

This commit is contained in:
Cristian
2020-09-24 08:37:27 -05:00
committed by Cristian Vargas
parent a40af98ced
commit 62ed11a5ca
6 changed files with 60 additions and 3 deletions

View File

@@ -85,6 +85,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},

View File

@@ -16,6 +16,7 @@ from ..config import (
CURL_USER_AGENT,
CURL_VERSION,
CHECK_SSL_VALIDITY,
SAVE_HEADERS
)
from ..logging_util import TimedProgress
@@ -24,7 +25,7 @@ def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
output = Path(out_dir or link.link_dir) / 'headers.json'
return not output.exists()
return not output.exists() and SAVE_HEADERS
@enforce_types

View File

@@ -186,13 +186,17 @@ def get_headers(url: str, timeout: int=None) -> str:
headers={'User-Agent': WGET_USER_AGENT},
verify=CHECK_SSL_VALIDITY,
timeout=timeout,
allow_redirects=True
)
if response.status_code >= 400:
raise RequestException
except RequestException:
response = requests.get(
url,
headers={'User-Agent': WGET_USER_AGENT},
verify=CHECK_SSL_VALIDITY,
timeout=timeout,
stream=True
)
return pyjson.dumps(dict(response.headers), indent=4)