diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index 1abdbfc4..9629b3aa 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -347,6 +347,22 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
         binary_overrides = record.get('overrides', {})
         normalized_overrides = binary_overrides if isinstance(binary_overrides, dict) else {}
 
+        # abx-plugins currently emits a GitHub install URL for readability-extractor,
+        # but the package is published on npm. Prefer the registry package to avoid
+        # long git-based installs in CI while still using canonical install_args.
+        if (
+            name == 'readability-extractor'
+            and isinstance(normalized_overrides.get('npm'), dict)
+            and normalized_overrides['npm'].get('install_args') == ['https://github.com/ArchiveBox/readability-extractor']
+        ):
+            normalized_overrides = {
+                **normalized_overrides,
+                'npm': {
+                    **normalized_overrides['npm'],
+                    'install_args': ['readability-extractor'],
+                },
+            }
+
         # Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders
         # This happens when on_Crawl hooks detect already-installed binaries
         abspath = record.get('abspath')
diff --git a/archivebox/machine/tests/test_machine_models.py b/archivebox/machine/tests/test_machine_models.py
index 4904f1a8..6a1d4514 100644
--- a/archivebox/machine/tests/test_machine_models.py
+++ b/archivebox/machine/tests/test_machine_models.py
@@ -229,6 +229,27 @@ class TestBinaryModel(TestCase):
 
         self.assertEqual(binary.overrides, overrides)
 
+    def test_binary_from_json_prefers_published_readability_package(self):
+        """Binary.from_json() should rewrite readability's npm git URL to the published package."""
+        binary = Binary.from_json({
+            'name': 'readability-extractor',
+            'binproviders': 'env,npm',
+            'overrides': {
+                'npm': {
+                    'install_args': ['https://github.com/ArchiveBox/readability-extractor'],
+                },
+            },
+        })
+
+        self.assertEqual(
+            binary.overrides,
+            {
+                'npm': {
+                    'install_args': ['readability-extractor'],
+                },
+            },
+        )
+
 
 class TestBinaryStateMachine(TestCase):
     """Test the BinaryMachine state machine."""