# Package definition for trafilatura: fetches the release from PyPI, builds it
# as a pyproject-style Python package with setuptools, and runs its pytest
# suite with the network-dependent tests disabled.
{
  lib,
  buildPythonPackage,
  certifi,
  charset-normalizer,
  courlan,
  fetchPypi,
  htmldate,
  justext,
  lxml,
  pytestCheckHook,
  pythonOlder,
  setuptools,
  urllib3,
}:

buildPythonPackage rec {
  pname = "trafilatura";
  version = "1.9.0";
  pyproject = true;

  # Not built for Python interpreters older than 3.9.
  disabled = pythonOlder "3.9";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-5oM9KauKE+2FOTfXyR5oaLxi774QIUrCsQZDbdI9FBI=";
  };

  # Patch out the GUI CLI entry point because it is not supported in this
  # packaging, and nixify the path to the trafilatura binary in the test suite.
  # `--replace-fail` aborts the build if either pattern is no longer present,
  # so a changed upstream layout is noticed instead of silently skipped.
  # `$out` below is a shell variable expanded at build time, not Nix
  # interpolation.
  postPatch = ''
    substituteInPlace setup.py \
      --replace-fail '"trafilatura_gui=trafilatura.gui:main",' ""
    substituteInPlace tests/cli_tests.py \
      --replace-fail "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'"
  '';

  build-system = [ setuptools ];

  dependencies = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # Disable tests that require an internet connection
    "test_cli_pipeline"
    "test_crawl_page"
    "test_download"
    "test_fetch"
    "test_meta_redirections"
    "test_probing"
    "test_queue"
    "test_redirection"
    "test_whole"
  ];

  pythonImportsCheck = [ "trafilatura" ];

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ jokatzke ];
    mainProgram = "trafilatura";
  };
}