# lexicabble-download/scripts/download.sh
#
# 27 lines
# 505 B
# Bash

# Downloads Diccionario.jar, unpacks it, converts every bundled .txt file from
# ISO-8859-1 to UTF-8 and writes the sorted, de-duplicated lines to lexicon.txt
# in the repo root (the parent of the directory holding this script).
# Requires: wget, unzip, iconv, sort.

# Stop at the first failing command or unset variable instead of carrying on
# with missing inputs.
set -eu

# Variables
DICT=Diccionario.jar
URL=http://www.redeletras.com/diccionario/$DICT
TMP=extracted
LIST=lexicon.txt

# Goes to repo root
cd -- "$(dirname -- "$0")/.."

# Removes the intermediate files; the downloaded archive and the final list
# are kept.
cleanup() {
  rm -rf -- "$TMP" "$LIST.tmp"
}
# Cleans when the script exits, including when a step fails
trap cleanup EXIT
# Drops leftovers from an interrupted earlier run: a stale $LIST.tmp would be
# merged into the new list, and a stale $TMP makes unzip ask before overwriting.
cleanup

# Downloads dictionary
if ! wget -q --show-progress -O "$DICT" "$URL"; then
  # wget -O creates the output file even when the transfer fails
  rm -f -- "$DICT"
  echo "Failed to download $URL" >&2
  exit 1
fi
echo "Downloaded $DICT"

# Extracts dictionary
unzip -q -d "$TMP" "$DICT"

# Extracts list
for txt in "$TMP"/*.txt; do
  # An unmatched glob stays literal; skip it rather than hand it to iconv
  [ -e "$txt" ] || continue
  iconv -f ISO-8859-1 -t UTF-8 "$txt" >> "$LIST.tmp"
done

# Refuses to overwrite an existing list with nothing
if [ ! -s "$LIST.tmp" ]; then
  echo "No word lists (*.txt) found in $DICT" >&2
  exit 1
fi

# Sorts and uniqs
sort -u "$LIST.tmp" > "$LIST"