# Package definition for trafilatura, built from the PyPI sdist with
# setuptools as the build backend.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pytestCheckHook,
  pythonOlder,
  certifi,
  charset-normalizer,
  courlan,
  htmldate,
  justext,
  lxml,
  urllib3,
  setuptools,
}:

buildPythonPackage rec {
  pname = "trafilatura";
  version = "1.8.1";
  pyproject = true;

  # Not built for interpreters older than Python 3.9.
  disabled = pythonOlder "3.9";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-a4eN/b1cXftV0Pgwfyt9wVrDRYBU90hh/5ihcvXjhyA=";
  };

  # Two source tweaks before the build:
  #  * drop the GUI entry point from setup.py, since the GUI command line is
  #    not supported in this packaging;
  #  * make the CLI tests call the installed binary by its path under $out
  #    instead of relying on a bare `trafilatura` lookup.
  # `--replace-fail` aborts the build if either pattern is no longer present.
  postPatch = ''
    substituteInPlace setup.py \
      --replace-fail '"trafilatura_gui=trafilatura.gui:main",' ""
    substituteInPlace tests/cli_tests.py \
      --replace-fail "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'"
  '';

  nativeBuildInputs = [ setuptools ];

  # Runtime dependencies propagated to consumers of the package.
  propagatedBuildInputs = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # These tests need an internet connection, so they are skipped.
    "test_download"
    "test_fetch"
    "test_redirection"
    "test_meta_redirections"
    "test_crawl_page"
    "test_whole"
    "test_probing"
    "test_cli_pipeline"
  ];

  pythonImportsCheck = [ "trafilatura" ];

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.jokatzke ];
    mainProgram = "trafilatura";
  };
}