Use npm package for readability extractor installs

This commit is contained in:
Nick Sweeting
2026-03-15 13:09:18 -07:00
parent 957387fd88
commit 2585ef5870
2 changed files with 37 additions and 0 deletions

View File

@@ -347,6 +347,22 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
binary_overrides = record.get('overrides', {})
normalized_overrides = binary_overrides if isinstance(binary_overrides, dict) else {}
# abx-plugins currently emits a GitHub install URL for readability-extractor,
# but the package is published on npm. Prefer the registry package to avoid
# long git-based installs in CI while still using canonical install_args.
if (
name == 'readability-extractor'
and isinstance(normalized_overrides.get('npm'), dict)
and normalized_overrides['npm'].get('install_args') == ['https://github.com/ArchiveBox/readability-extractor']
):
normalized_overrides = {
**normalized_overrides,
'npm': {
**normalized_overrides['npm'],
'install_args': ['readability-extractor'],
},
}
# Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders
# This happens when on_Crawl hooks detect already-installed binaries
abspath = record.get('abspath')

View File

@@ -229,6 +229,27 @@ class TestBinaryModel(TestCase):
self.assertEqual(binary.overrides, overrides)
def test_binary_from_json_prefers_published_readability_package(self):
    """Check that from_json() swaps readability's GitHub install URL for the npm package name."""
    git_url_record = {
        'name': 'readability-extractor',
        'binproviders': 'env,npm',
        'overrides': {
            'npm': {
                'install_args': ['https://github.com/ArchiveBox/readability-extractor'],
            },
        },
    }
    # Only install_args should differ from the input: the registry package
    # name takes the place of the git URL.
    expected_overrides = {
        'npm': {
            'install_args': ['readability-extractor'],
        },
    }

    binary = Binary.from_json(git_url_record)

    self.assertEqual(binary.overrides, expected_overrides)
class TestBinaryStateMachine(TestCase):
"""Test the BinaryMachine state machine."""