dev-python/nltk-data: Install corpora/pl196x unconditionally
author Michał Górny <mgorny@gentoo.org>
Mon, 13 Apr 2020 09:26:40 +0000 (11:26 +0200)
committer Michał Górny <mgorny@gentoo.org>
Mon, 13 Apr 2020 10:32:14 +0000 (12:32 +0200)
Install corpora/pl196x unconditionally as it is required
for dev-python/nltk-3.5 tests.

Signed-off-by: Michał Górny <mgorny@gentoo.org>
dev-python/nltk-data/nltk-data-20200312-r1.ebuild [new file with mode: 0644]

diff --git a/dev-python/nltk-data/nltk-data-20200312-r1.ebuild b/dev-python/nltk-data/nltk-data-20200312-r1.ebuild
new file mode 100644 (file)
index 0000000..c8f12c5
--- /dev/null
@@ -0,0 +1,184 @@
+# Copyright 2020 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=7
+
+inherit check-reqs  # presumably consumes the CHECKREQS_DISK_* values set further down -- confirm against the eclass
+
+DESCRIPTION="Data files for NLTK"
+HOMEPAGE="https://www.nltk.org/nltk_data/"
+
+# at least some of the files have poorly documented licenses
+# TODO: create a USE flag for free-ish subset
+LICENSE="all-rights-reserved"
+SLOT="0"
+KEYWORDS="~amd64 ~x86"
+IUSE="extra"  # extra: also fetch and unpack the PACKAGES_UNPACK_EXTRA set
+RESTRICT="bindist mirror"
+
+BDEPEND="app-arch/unzip"  # the distfiles extracted in src_unpack are .zip archives
+
+PACKAGES_ZIP=(  # kept zipped: src_install copies each archive from DISTDIR as <id>.zip
+       # wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort
+       corpora/comtrans
+       corpora/conll2007
+       corpora/jeita
+       corpora/knbc
+       corpora/machado
+       corpora/masc_tagged
+       corpora/nombank.1.0
+       corpora/panlex_swadesh
+       corpora/propbank
+       corpora/reuters
+       corpora/semcor
+       corpora/universal_treebanks_v20
+       sentiment/vader_lexicon
+       stemmers/snowball_data
+)
+
+PACKAGES_UNPACK=(  # always extracted by src_unpack into ${S}/<subdir>/
+       # wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort
+       corpora/abc
+       corpora/alpino
+       corpora/brown
+       corpora/cess_cat
+       corpora/cess_esp
+       corpora/chat80
+       corpora/city_database
+       corpora/cmudict
+       corpora/comparative_sentences
+       corpora/conll2000
+       corpora/conll2002
+       corpora/crubadan
+       corpora/dependency_treebank
+       corpora/dolch
+       corpora/europarl_raw
+       corpora/floresta
+       corpora/framenet_v15
+       corpora/framenet_v17
+       corpora/gazetteers
+       corpora/genesis
+       corpora/gutenberg
+       corpora/ieer
+       corpora/inaugural
+       corpora/indian
+       corpora/lin_thesaurus
+       corpora/mac_morpho
+       corpora/movie_reviews
+       corpora/mte_teip5
+       corpora/names
+       corpora/nonbreaking_prefixes
+       corpora/nps_chat
+       corpora/omw
+       corpora/opinion_lexicon
+       corpora/pl196x
+       corpora/ppattach
+       corpora/product_reviews_1
+       corpora/product_reviews_2
+       corpora/pros_cons
+       corpora/ptb
+       corpora/qc
+       corpora/rte
+       corpora/senseval
+       corpora/sentence_polarity
+       corpora/sentiwordnet
+       corpora/shakespeare
+       corpora/sinica_treebank
+       corpora/state_union
+       corpora/stopwords
+       corpora/subjectivity
+       corpora/swadesh
+       corpora/switchboard
+       corpora/timit
+       corpora/toolbox
+       corpora/treebank
+       corpora/twitter_samples
+       corpora/udhr
+       corpora/udhr2
+       corpora/verbnet
+       corpora/webtext
+       corpora/wordnet
+       corpora/wordnet_ic
+       corpora/words
+       grammars/book_grammars
+       grammars/large_grammars
+       grammars/sample_grammars
+       misc/perluniprops
+       models/bllip_wsj_no_aux
+       models/moses_sample
+       models/wmt15_eval
+       models/word2vec_sample
+       stemmers/porter_test
+       stemmers/rslp
+       taggers/averaged_perceptron_tagger
+       taggers/averaged_perceptron_tagger_ru
+       taggers/universal_tagset
+       tokenizers/punkt
+)
+
+PACKAGES_UNPACK_EXTRA=(  # optional set: downloaded and extracted only when USE=extra is enabled
+       chunkers/maxent_ne_chunker
+       corpora/biocreative_ppi
+       corpora/brown_tei
+       corpora/kimmo
+       corpora/paradigms
+       corpora/pe08
+       corpora/pil
+       corpora/problem_reports
+       corpora/smultron
+       corpora/unicode_samples
+       corpora/verbnet3
+       corpora/ycoe
+       grammars/basque_grammars
+       grammars/spanish_grammars
+       help/tagsets
+       misc/mwa_ppdb
+       taggers/maxent_treebank_pos_tagger
+)
+
+add_data() {
+       local x
+       for x; do
+               SRC_URI+="
+                       https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${x}.zip
+                               -> nltk-${x#*/}-${PV}.zip"
+       done
+}
+
+add_data "${PACKAGES_ZIP[@]}" "${PACKAGES_UNPACK[@]}"  # unconditional downloads
+SRC_URI+="
+       extra? ("
+add_data "${PACKAGES_UNPACK_EXTRA[@]}"  # entries land between "extra? (" and ")", so they are USE-conditional
+SRC_URI+="
+       )"
+
+CHECKREQS_DISK_USR=3G
+CHECKREQS_DISK_BUILD=${CHECKREQS_DISK_USR}  # reuses the CHECKREQS_DISK_USR value (3G)
+
+src_unpack() {
+       local x
+       local to_unpack=( "${PACKAGES_UNPACK[@]}" )
+       use extra && to_unpack+=( "${PACKAGES_UNPACK_EXTRA[@]}" )
+       for x in "${to_unpack[@]}"; do
+               local cat=${x%/*}
+               local pkg=${x#*/}
+
+               mkdir -p "${S}/${cat}" || die
+               cd "${S}/${cat}" || die
+               unpack "nltk-${pkg}-${PV}.zip"
+       done
+}
+
+src_install() {
+       dodir /usr/share/nltk_data
+       mv * "${ED}/usr/share/nltk_data/" || die
+
+       local x
+       for x in "${PACKAGES_ZIP[@]}"; do
+               local cat=${x%/*}
+               local pkg=${x#*/}
+
+               insinto "/usr/share/nltk_data/${cat}"
+               newins "${DISTDIR}/nltk-${pkg}-${PV}.zip" "${pkg}.zip"
+       done
+}