2017-08-29 15:01:17 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
2017-10-10 14:42:09 +02:00
|
|
|
|
|
|
|
from datetime import datetime
|
|
|
|
print(datetime.now())
|
2017-10-16 14:01:38 +02:00
|
|
|
from datetime import datetime
|
2017-10-10 14:42:09 +02:00
|
|
|
|
|
|
|
import time
|
|
|
|
import logging
|
2017-10-16 14:01:38 +02:00
|
|
|
from stop_words import get_stop_words
|
|
|
|
|
|
|
|
#import words as words
|
|
|
|
from nltk.corpus import stopwords as nltk_stopwords
|
|
|
|
from collections import Counter
|
2017-08-29 15:01:17 +02:00
|
|
|
import csv
|
|
|
|
import re
|
2017-09-11 13:00:03 +02:00
|
|
|
import xml.etree.ElementTree as ET
|
2017-08-29 15:01:17 +02:00
|
|
|
import spacy
|
2017-09-11 12:12:28 +02:00
|
|
|
import textacy
|
2017-09-11 13:00:03 +02:00
|
|
|
from scipy import *
|
2017-10-10 14:42:09 +02:00
|
|
|
import sys
|
2017-08-29 15:01:17 +02:00
|
|
|
csv.field_size_limit(sys.maxsize)
|
2017-10-12 15:57:56 +02:00
|
|
|
import pickle
|
2017-10-16 14:01:38 +02:00
|
|
|
import configparser as ConfigParser
|
|
|
|
from miscellaneous import *
|
2017-10-12 15:57:56 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
import time
|
2017-10-12 15:57:56 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
from datetime import datetime
|
|
|
|
import logging
|
|
|
|
from nltk.corpus import stopwords
|
|
|
|
import csv
|
|
|
|
import functools
|
|
|
|
import re
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
import spacy
|
|
|
|
import textacy
|
|
|
|
from scipy import *
|
|
|
|
import sys
|
|
|
|
csv.field_size_limit(sys.maxsize)
|
2017-10-12 15:57:56 +02:00
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
import time
|
2017-10-12 15:57:56 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
import logging
|
|
|
|
from nltk.corpus import stopwords
|
|
|
|
import csv
|
|
|
|
import functools
|
|
|
|
import re
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
import spacy
|
|
|
|
import textacy
|
|
|
|
from scipy import *
|
|
|
|
import sys
|
|
|
|
csv.field_size_limit(sys.maxsize)
|
2017-10-12 15:57:56 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
import pickle
|
2017-10-10 14:42:09 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
|
|
|
|
# load config
# The path is an absolute, machine-specific location; the module cannot be
# imported on a host where this file does not exist.
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

# Parser instance that the module-level path lookups further down read from.
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
|
2017-08-31 14:54:01 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
|
|
|
|
|
2017-08-31 14:54:01 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
# Character class of punctuation / special characters; used to split strings
# and to reject tokens that contain any of these characters.
REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
# A dot followed by 2-3 lowercase letters, optionally followed by a second
# such group (e.g. ".de", ".co.uk"); used to spot domain-like words.
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

# Module-level lookup tables. They start out empty. Several functions in this
# module capture these exact objects (as default arguments or as arguments of
# filters built at import time), so they have to be filled IN PLACE
# (e.g. dict.update / slice assignment) rather than rebound to new objects.
THESAURUS = {}
WORDS = {}
LEMMAS = {}
NOUNS = []
VORNAMEN= []
de_stop_words=[]
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
############# filter tokens
|
2017-08-31 14:54:01 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def keepPOS(pos_list):
    """Build a token predicate that accepts tokens whose ``pos_`` is in ``pos_list``.

    :param pos_list: container of part-of-speech tags to keep
    :return: function ``tok -> bool``
    """
    def has_wanted_pos(tok):
        return tok.pos_ in pos_list

    return has_wanted_pos
|
2017-08-29 15:01:17 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def keepNouns(noun_list=NOUNS):
    """Build a token predicate that accepts tokens whose lowercase form is in ``noun_list``.

    :param noun_list: container of lowercase nouns; defaults to the
        module-level ``NOUNS`` object as it exists when this function is defined
    :return: function ``tok -> bool``
    """
    def is_listed_noun(tok):
        return tok.lower_ in noun_list

    return is_listed_noun
|
2017-08-29 15:01:17 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def removePOS(pos_list):
    """Build a token predicate that rejects tokens whose ``pos_`` is in ``pos_list``.

    :param pos_list: container of part-of-speech tags to drop
    :return: function ``tok -> bool`` (True means "keep the token")
    """
    def has_unlisted_pos(tok):
        return tok.pos_ not in pos_list

    return has_unlisted_pos
|
2017-08-29 15:01:17 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def removeWords(words, keep=None):
    """Build a token predicate that rejects tokens whose lowercase form is in ``words``.

    The caller's ``words`` container is never modified; words listed in
    ``keep`` are simply exempt from removal. ``words`` is referenced, not
    copied, so later in-place changes to it are seen by the predicate.

    :param words: container of lowercase words to drop
    :param keep: optional word (str) or iterable of words that must survive
        even if they appear in ``words``; non-iterable values are ignored
    :return: function ``tok -> bool`` (True means "keep the token")
    """
    if isinstance(keep, str):
        # A bare string is one word, not an iterable of characters.
        protected = frozenset([keep])
    elif hasattr(keep, '__iter__'):
        protected = frozenset(keep)
    else:
        protected = frozenset()

    return lambda tok: tok.lower_ in protected or tok.lower_ not in words
|
|
|
|
|
|
|
|
|
|
|
|
def keepENT(ent_list):
    """Build a token predicate that accepts tokens whose ``ent_type_`` is in ``ent_list``.

    :param ent_list: container of entity-type labels to keep
    :return: function ``tok -> bool``
    """
    def has_wanted_entity(tok):
        return tok.ent_type_ in ent_list

    return has_wanted_entity
|
|
|
|
|
|
|
|
|
|
|
|
def removeENT(ent_list):
    """Build a token predicate that rejects tokens whose ``ent_type_`` is in ``ent_list``.

    :param ent_list: container of entity-type labels to drop
    :return: function ``tok -> bool`` (True means "keep the token")
    """
    def has_unlisted_entity(tok):
        return tok.ent_type_ not in ent_list

    return has_unlisted_entity
|
|
|
|
|
|
|
|
|
|
|
|
def remove_words_containing_Numbers():
    """Build a token predicate that rejects tokens containing at least one digit.

    :return: function ``tok -> bool`` (True means "keep the token")
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    # Compiled once per predicate instead of being looked up for every token.
    digit = re.compile(r'\d')
    return lambda tok: digit.search(tok.lower_) is None
|
|
|
|
|
|
|
|
|
|
|
|
def remove_words_containing_topLVL():
    """Build a token predicate that rejects tokens matching ``REGEX_TOPLVL`` anywhere.

    :return: function ``tok -> bool`` (True means "keep the token")
    """
    def lacks_top_level_domain(tok):
        match = re.search(REGEX_TOPLVL, tok.lower_)
        return not bool(match)

    return lacks_top_level_domain
|
2017-10-10 14:42:09 +02:00
|
|
|
|
|
|
|
|
|
|
|
def remove_words_containing_specialCharacters():
    """Build a token predicate that rejects tokens matching ``REGEX_SPECIALCHAR`` anywhere.

    :return: function ``tok -> bool`` (True means "keep the token")
    """
    def lacks_special_characters(tok):
        match = re.search(REGEX_SPECIALCHAR, tok.lower_)
        return not bool(match)

    return lacks_special_characters
|
2017-10-10 14:42:09 +02:00
|
|
|
|
|
|
|
|
|
|
|
def remove_long_words(max_len=35):
    """Build a token predicate that rejects tokens longer than ``max_len`` characters.

    :param max_len: largest token length (inclusive) that is still kept
    :return: function ``tok -> bool`` (True means "keep the token")
    """
    return lambda tok: len(tok.lower_) <= max_len


def remove_short_words(min_len=2):
    """Build a token predicate that rejects tokens shorter than ``min_len`` characters.

    :param min_len: smallest token length (inclusive) that is still kept
    :return: function ``tok -> bool`` (True means "keep the token")
    """
    return lambda tok: len(tok.lower_) >= min_len
|
|
|
|
|
|
|
|
|
|
|
|
def remove_first_names():
    """Build a token predicate that rejects tokens equal to a first name in ``VORNAMEN``.

    The comparison is case-insensitive. ``VORNAMEN`` is read each time the
    predicate is called, so the lowercase name list is rebuilt per token.

    :return: function ``tok -> bool`` (True means "keep the token")
    """
    def is_not_first_name(tok):
        lowered_names = [name.lower() for name in VORNAMEN]
        return tok.lower_ not in lowered_names

    return is_not_first_name
|
|
|
|
|
|
|
|
|
|
|
|
############# strings
|
|
|
|
|
|
|
|
def remove_addresses(string):
    """Placeholder for address removal; not implemented yet.

    :param string: input text (currently ignored)
    :return: always ``None``
    """
    # todo
    return None
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
    """Look ``word`` up in ``lemma_dict`` repeatedly and return the result.

    The word is lowercased and replaced by its dictionary entry up to ``n``
    times, so chains such as inflection -> lemma -> canonical lemma are
    followed. Words without an entry are returned lowercased.

    :param word: word to lemmatize
    :param lemma_dict: mapping ``lowercase word -> lemma``
    :param n: maximum number of successive lookups
    :return: the lemma, or the lowercased word when there is no entry; if the
        word (or an intermediate lemma) has no ``lower()`` method it is
        printed and returned as it is at that point
    """
    for _ in range(n):
        try:
            lowered = word.lower()
        except AttributeError:
            # Not a string (bad input or a non-string dictionary value):
            # report it once and stop instead of failing again on every pass.
            print(word)
            break
        lemma = lemma_dict.get(lowered, lowered)
        if lemma == word:
            # Fixed point reached; further lookups cannot change the result.
            break
        word = lemma
    return word
|
|
|
|
|
|
|
|
def getFirstSynonym(word, thesaurus=THESAURUS):
    """Return the thesaurus entry for ``word``, or the lowercased word itself.

    :param word: word to normalise; non-string values are returned as ``str(word)``
    :param thesaurus: mapping ``lowercase word -> replacement``
    :return: ``thesaurus[word.lower()]`` when present, else ``word.lower()``
    """
    if not isinstance(word, str):
        return str(word)

    key = word.lower()
    if key not in thesaurus.keys():
        return str(key)
    return thesaurus[key]
|
2017-08-31 14:54:01 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
########################## Spellchecking ##########################################
|
|
|
|
# http://norvig.com/spell-correct.html
|
|
|
|
# http://wortschatz.uni-leipzig.de/en/download
|
2017-08-31 14:54:01 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
import re
|
|
|
|
from collections import Counter
|
2017-09-12 14:56:11 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def words(text):
    """Return all ``\\w+`` runs of the lowercased ``text`` as a list."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
|
2017-09-12 14:56:11 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def P(word, N=None):
    """Probability of `word` according to the frequency table ``WORDS``.

    :param word: word to look up
    :param N: total number of counted words; when omitted it is computed from
        ``WORDS`` at call time (a default evaluated at definition time would
        be frozen while ``WORDS`` is still empty)
    :return: relative frequency of ``word``; ``0`` for unknown words and
        ``0.0`` when the total is zero
    """
    if N is None:
        N = sum(WORDS.values())
    if not N:
        return 0.0
    # .get(): WORDS is a plain dict, so unknown words must not raise KeyError.
    return WORDS.get(word, 0) / N
|
2017-08-31 14:54:01 +02:00
|
|
|
|
2017-09-11 12:12:28 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def correction(word):
    """Return the candidate spelling of ``word`` that ``P`` rates highest."""
    options = candidates(word)
    return max(options, key=P)
|
2017-09-11 12:12:28 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def candidates(word):
    """Generate possible spelling corrections for ``word``.

    Preference order: the word itself if known, then known words one edit
    away, then known words two edits away, finally ``[word]`` unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]
|
2017-09-11 12:12:28 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def known(words):
    """Return the set of entries from ``words`` that are keys of ``WORDS``."""
    return {candidate for candidate in words if candidate in WORDS}
|
2017-08-31 14:54:01 +02:00
|
|
|
|
2017-09-11 13:24:20 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def edits1(word):
    """Return the set of all strings exactly one edit away from ``word``.

    An edit is a single deletion, a transposition of two adjacent characters,
    a replacement by, or an insertion of, one of the letters a-z.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()

    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # insertion at the cut position
        for ch in alphabet:
            variants.add(head + ch + tail)

        if not tail:
            continue

        # deletion of the character after the cut
        variants.add(head + tail[1:])

        # replacement of the character after the cut
        for ch in alphabet:
            variants.add(head + ch + tail[1:])

        # transposition of the two characters after the cut
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])

    return variants
|
2017-08-31 14:54:01 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def edits2(word):
    """Return a generator over all strings two edits away from ``word``.

    Duplicates are not removed; each result of ``edits1`` is edited once more.
    """
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))
|
2017-08-31 14:54:01 +02:00
|
|
|
|
2017-09-11 13:24:20 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
def autocorrectWord(word):
    """Return the spelling correction of ``word``, or ``word`` itself on failure.

    Correction is best-effort: any ordinary error raised by ``correction``
    leaves the word unchanged.

    :param word: word to correct
    :return: corrected word, or the input when correction fails
    """
    try:
        return correction(word)
    except Exception:
        # Deliberately broad, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return word
|
2017-08-31 14:54:01 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
############# stringcleaning
|
|
|
|
|
|
|
|
def stringcleaning(stringstream):
    """Lazily clean every string of ``stringstream`` before it is parsed.

    Steps per string, in this order: lowercase and repair broken unicode,
    drop whitespace-separated words matching ``REGEX_TOPLVL``, transliterate
    German umlauts and sharp s, replace ``REGEX_SPECIALCHAR`` matches by
    spaces, cut everything from the last "gruss" onwards, then lemmatize,
    map to synonyms and autocorrect word by word.

    :param stringstream: iterable of strings
    :return: generator of cleaned strings
    """
    umlaut_table = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})
    toplvl_pattern = re.compile(REGEX_TOPLVL)
    specialchar_pattern = re.compile(REGEX_SPECIALCHAR)
    cut_marker = "gruss"

    for text in stringstream:
        # fixUnicode
        text = textacy.preprocess.fix_bad_unicode(text.lower(), normalization=u'NFC')

        # remove_words_containing_topLVL
        kept = (w.lower() for w in text.split() if toplvl_pattern.search(w) is None)
        text = " ".join(kept)

        # replaceRockDots
        text = text.translate(umlaut_table)

        # seperate_words_on_regex: every special character becomes a space
        text = specialchar_pattern.sub(" ", text)

        # cut_after: keep only what precedes the last occurrence of the marker
        if cut_marker in text:
            text = text.rpartition(cut_marker)[0]

        # lemmatize
        text = " ".join(map(lemmatizeWord, text.split()))

        # normalise synonyms  # idea: before or after lemmatizing?
        text = " ".join(map(getFirstSynonym, text.split()))

        # autocorrect
        text = " ".join(map(autocorrectWord, text.split()))

        yield text
|
2017-08-31 14:54:01 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def filterTokens(tokens, funclist):
    """Apply every predicate of ``funclist`` to ``tokens``, one after another.

    :param tokens: iterable of tokens
    :param funclist: iterable of predicates ``tok -> bool``; a token survives
        only if every predicate accepts it
    :return: list of surviving tokens (``tokens`` itself if ``funclist`` is empty)
    """
    remaining = tokens
    for predicate in funclist:
        survivors = filter(predicate, remaining)
        remaining = list(survivors)
    return remaining
|
2017-09-11 13:00:03 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def corpus2Text(corpus):
    """Yield the ``text`` attribute of every document in ``corpus``, in order."""
    yield from (document.text for document in corpus)
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def corpus2Meta(corpus):
    """Yield the ``metadata`` attribute of every document in ``corpus``, in order."""
    yield from (document.metadata for document in corpus)
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def processContentstream(textstream, parser, token_filterlist=None):
    """Clean, parse and filter a stream of texts.

    :param textstream: string-gen
    :param parser: spacy-parser (its ``pipe`` method is used)
    :param token_filterlist: [func] token predicates handed to ``filterTokens``;
        ``None`` disables filtering
    :return: string-gen; per document the lowercase forms of the remaining
        tokens joined by single spaces
    """
    # pre_parse
    cleaned = stringcleaning(textstream)

    for doc in parser.pipe(cleaned):
        tokens = list(doc)

        # in_parse
        if token_filterlist is not None:
            tokens = filterTokens(tokens, token_filterlist)

        yield " ".join(tok.lower_ for tok in tokens)
|
|
|
|
# yield " ".join(list(set([tok.lower_ for tok in tokens])))
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def processDictstream(dictstream, funcdict, parser):
    """Filter selected fields of every dict in a stream.

    :param dictstream: dict-gen
    :param funcdict: maps a field name to the list of token predicates to
        apply to that field, e.g.
            clean_in_meta = {
                "Solution": funclist,
                ...
            }
    :param parser: spacy-parser; called on the value of every listed field
    :return: dict-gen; listed fields are replaced by the space-joined
        lowercase forms of their surviving tokens, other fields are copied
    """
    for record in dictstream:
        cleaned = {}
        for field, content in record.items():
            if field not in funcdict:
                cleaned[field] = content
                continue

            parsed_tokens = list(parser(content))
            surviving = filterTokens(parsed_tokens, funcdict[field])
            cleaned[field] = " ".join(tok.lower_ for tok in surviving)
        yield cleaned
|
2017-09-11 13:00:03 +02:00
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
##################################################################################################
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-12 15:57:56 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
# Locations of the pickled lookup resources: option "pickle_file" of one
# config section per resource.
(
    path2thesaurus_dict,
    path2wordsdict,
    path2lemmadict,
    path2nouns_list,
    path2firstnameslist,
    path2stopwordlist,
) = (
    config.get(section, "pickle_file")
    for section in (
        "thesaurus",
        "spellchecking",
        "lemmatization",
        "nouns",
        "firstnames",
        "de_stopwords",
    )
)

# Per language: corpus directory, name of the raw corpus, name of the
# preprocessed corpus.
corpus_de_path, raw_de_name, pre_de_name = (
    config.get("de_corpus", option) for option in ("path", "raw", "pre")
)
corpus_en_path, raw_en_name, pre_en_name = (
    config.get("en_corpus", option) for option in ("path", "raw", "pre")
)
|
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
# Additional domain-specific words (greetings, courtesy phrases, generic
# ticket vocabulary), already lowercased and umlaut-transliterated.
# In the visible part of this file the list is only referenced from a
# commented-out removeWords(...) entry. Note that "gruss" is listed twice.
custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
                "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen",
                "versuchen", "unbestimmt", "woche", "tadelos", "klappen", "mittlerweile", "bekommen",
                "erreichbar", "gruss", "auffahren", "vorgang", "hinweis", "institut", "universitaet",
                "name", "gruss", "id", "erfolg", "mail","folge",
                "nummer", "team", "fakultaet", "email", "absender", "tu", "versenden", "vorname", "message",
                "service", "strasse", "prozess", "portal", "raum", "personal", "moeglichkeit", "fremd", "wende",
                "rueckfrage", "stehen", "verfuegung",
                "funktionieren", "kollege", "pruefen", "hoffen"
                ]
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
# Token predicates applied, in this order, to every parsed document text;
# main() passes this list as token_filterlist to processContentstream().
# The predicates are built once, when the module is imported.
filter_tokens = [
    # removeENT(["PERSON"]),
    # idea: remove addresses  # so far done via cut_after("gruss") --> postal.parser

    keepNouns(),

    remove_words_containing_Numbers(),

    removePOS(["PUNCT", "SPACE", "NUM"]),

    #removeWords(de_stop_words + custom_words),
    removeWords(de_stop_words),

    remove_long_words(),
    remove_short_words(),
    remove_first_names()

]
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
# Metadata fields to clean and the token predicates applied to each of them;
# main() passes this dict as funcdict to processDictstream(). Fields that are
# not listed here are copied unchanged.
clean_in_meta = {
    "Solution": [removePOS(["SPACE"])],
    "Subject": [removePOS(["SPACE", "PUNCT"])],
    "categoryName": [removePOS(["SPACE", "PUNCT"])]
}
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Preprocess the raw German and English corpora and save the results.

    Loads the pickled lookup resources, loads both raw corpora, runs text and
    metadata of every document through the preprocessing pipeline, drops
    empty documents, prints 20 random German documents for inspection and
    saves both preprocessed corpora.
    """
    start = time.time()
    printlog("Preprocessing: {0}".format(datetime.now()))

    # Fill the module-level tables IN PLACE. Plain assignment here would only
    # create function locals, leaving the objects captured by filter_tokens
    # and by the default arguments of the helper functions empty.
    # NOTE(review): assumes load_obj returns mappings for the first three
    # resources and iterables for the rest - confirm against miscellaneous.py.
    for table, path in ((THESAURUS, path2thesaurus_dict),
                        (WORDS, path2wordsdict),
                        (LEMMAS, path2lemmadict)):
        table.clear()
        table.update(load_obj(path))

    # The stop words previously went into an unused name (DE_STOP_WORDS),
    # while the filters reference de_stop_words.
    de_stop_words[:] = load_obj(path2stopwordlist)
    NOUNS[:] = load_obj(path2nouns_list)
    VORNAMEN[:] = load_obj(path2firstnameslist)

    # load raw corpus and create new one
    raw_de_corpus, DE_PARSER = load_corpus(corpus_name=raw_de_name, corpus_path=corpus_de_path)
    raw_en_corpus, EN_PARSER = load_corpus(corpus_name=raw_en_name, corpus_path=corpus_en_path)

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)

    ## process and add files to the textacy corpora
    printlog("Preprocess and add texts to textacy-corpi")

    # processDictstream calls its parser argument, so it gets the parser
    # object itself, the same one used for the document texts.
    # NOTE(review): previously `raw_*_corpus.lang` was passed; in textacy that
    # attribute appears to be the language code string - confirm.
    de_corpus.add_texts(
        processContentstream(corpus2Text(raw_de_corpus), token_filterlist=filter_tokens, parser=DE_PARSER),
        processDictstream(corpus2Meta(raw_de_corpus), clean_in_meta, parser=DE_PARSER)
    )
    en_corpus.add_texts(
        processContentstream(corpus2Text(raw_en_corpus), token_filterlist=filter_tokens, parser=EN_PARSER),
        processDictstream(corpus2Meta(raw_en_corpus), clean_in_meta, parser=EN_PARSER)
    )

    # drop empty docs from the corpora
    de_corpus.remove(lambda doc: len(doc) == 0)
    en_corpus.remove(lambda doc: len(doc) == 0)

    for _ in range(20):
        printRandomDoc(de_corpus)
        #printRandomDoc(en_corpus)

    # save corpora
    save_corpus(corpus=de_corpus, corpus_path=corpus_de_path, corpus_name=pre_de_name)
    save_corpus(corpus=en_corpus, corpus_path=corpus_en_path, corpus_name=pre_en_name)

    end = time.time()
    printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
"""
|
|
|
|
pipe=[
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
##String
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
fixUnicode(),
|
|
|
|
replaceHardS(),
|
|
|
|
resolveAbbrivations(),
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
remove_words_containing_topLVL(),
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
replaceSpecialChars(" "), (mit Leerzeichen ersetzen, dadurch werden Terme wie 8203;verfügung getrennt)
|
2017-09-11 13:00:03 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
remove_words_containing_Numbers(),
|
2017-08-31 14:54:01 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
##spacyParse
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
removeENT("PERSON"),
|
|
|
|
keepPOS(["NOUN"]),
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
#ODER
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
lemmatize(),
|
|
|
|
removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),
|
2017-08-29 15:01:17 +02:00
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
# evtl.
|
|
|
|
spellCorrection(),
|
|
|
|
keepUniqeTokens(),
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
]
|
2017-08-29 15:01:17 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-10 14:42:09 +02:00
|
|
|
"""
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
"""
|
|
|
|
filter_tokens=[
|
|
|
|
#removeENT(["PERSON"]),
|
|
|
|
#idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser
|
|
|
|
#idee rechtschreibkorrektur --> PyEnchant
|
|
|
|
#idee thesaurus --> WordNet, eigener
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
remove_words_containing_Numbers(),
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
removePOS(["PUNCT","SPACE","NUM"]),
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
removeWords(de_stop_words+custom_words),
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
remove_long_words(),
|
|
|
|
remove_short_words(),
|
|
|
|
remove_first_names(),
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
keepPOS(["NOUN"]),
|
2017-08-29 15:01:17 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
]
|
|
|
|
"""
|