topicModelingTickets/preprocessing.py

# -*- coding: utf-8 -*-
from datetime import datetime
import csv
import sys

from miscellaneous import *
import time
import textacy
from scipy import *

import os

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

# module-level globals; the dictionaries below are filled in main()
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]'  # +r',.'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

THESAURUS = {}
WORDS = {}
LEMMAS = {}
NOUNS = {}
VORNAMEN = {}
DE_STOP_WORDS = {}
EN_STOP_WORDS = {}

############# filter tokens

def keepPOS(pos_list):
    return lambda tok: tok.pos_ in pos_list


def keepNouns(noun_list=None):
    # fall back to the module-level NOUNS at call time, not at definition time
    noun_list = noun_list if noun_list is not None else NOUNS
    return lambda tok: tok.lower_ in noun_list


def removePOS(pos_list):
    return lambda tok: tok.pos_ not in pos_list


def removeWords(words, keep=None):
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
                words.remove(k)
            except ValueError:
                pass

    return lambda tok: tok.lower_ not in words


def keepENT(ent_list):
    return lambda tok: tok.ent_type_ in ent_list


def removeENT(ent_list):
    return lambda tok: tok.ent_type_ not in ent_list


def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search(r'\d', tok.lower_))


def remove_words_containing_topLVL():
    return lambda tok: not bool(re.search(REGEX_TOPLVL, tok.lower_))


def remove_words_containing_specialCharacters():
    return lambda tok: not bool(re.search(REGEX_SPECIALCHAR, tok.lower_))


def remove_long_words():
    # drop tokens longer than 35 characters
    return lambda tok: not len(tok.lower_) > 35


def remove_short_words():
    # drop tokens shorter than 2 characters
    return lambda tok: not len(tok.lower_) < 2


def remove_first_names():
    return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]


############# strings

def remove_addresses(string):
    pass  # todo

def lemmatizeWord(word, lemma_dict=None, n=3):
    # fall back to the module-level LEMMAS at call time, not at definition time
    lemma_dict = lemma_dict if lemma_dict is not None else LEMMAS
    for i in range(n):
        try:
            word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
        except Exception:
            print(word)
    return word


def getFirstSynonym(word, thesaurus=None):
    # fall back to the module-level THESAURUS at call time, not at definition time
    thesaurus = thesaurus if thesaurus is not None else THESAURUS
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    if word in thesaurus.keys():
        return thesaurus[word]
    else:
        return str(word)

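# Hedged usage sketch for the two helpers above: not called anywhere in this
# module; the dictionary entries are hypothetical and only show the call
# pattern. postparse() and stringcleaning() apply the same order:
# lemmatize first, then look up the first synonym.
def _normalizeWord_demo(word="Rechnern"):
    lemma_dict = {"rechnern": "rechner"}  # hypothetical LEMMAS entry
    thesaurus = {"rechner": "computer"}   # hypothetical THESAURUS entry
    lemma = lemmatizeWord(word, lemma_dict=lemma_dict)
    return getFirstSynonym(lemma, thesaurus=thesaurus)  # -> "computer"
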
########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download

import re


def words(text): return re.findall(r'\w+', text.lower())


def P(word, N=None):
    "Probability of `word`."
    # sum the word frequencies at call time, after WORDS has been loaded in main()
    N = N if N is not None else sum(WORDS.values())
    return WORDS[word] / N


def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)


def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])


def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))


def autocorrectWord(word):
    try:
        return correction(word)
    except Exception:
        return word

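# Hedged usage sketch for the Norvig-style corrector above: not called anywhere
# in this module and the sample words are made up. If WORDS is empty or a word
# cannot be corrected, autocorrectWord() simply returns its input unchanged.
def _autocorrect_demo(samples=("studnet", "fehelr")):
    return [autocorrectWord(w) for w in samples]
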
############# stringcleaning

@deprecated
def stringcleaning(stringstream):

    for string in stringstream:
        string = string.lower()

        # fixUnicode
        string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')

        # remove_words_containing_topLVL
        string = " ".join([w.lower() for w in string.split() if not re.search(REGEX_TOPLVL, w)])

        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)
        string = re.sub(r'[ö]', "oe", string)
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)

        # separate_words_on_regex:
        string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))

        # cut_after
        word = "gruss"  # idea: remove addresses --> postal.parser
        string = string.rpartition(word)[0] if word in string else string

        # lemmatize
        string = " ".join([lemmatizeWord(word) for word in string.split()])

        # normalize synonyms  # idea: before or after lemmatizing?
        string = " ".join([getFirstSynonym(word) for word in string.split()])

        # autocorrect
        string = " ".join([autocorrectWord(word) for word in string.split()])

        yield string

2017-10-16 14:01:38 +02:00
def filterTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
tokens = list(filter(f, tokens))
2017-08-31 14:54:01 +02:00
2017-10-16 14:01:38 +02:00
return tokens
2017-10-18 17:37:20 +02:00
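# Hedged usage sketch for filterTokens(): not called anywhere in this module.
# It expects a spacy Doc (or any token sequence exposing .pos_ / .lower_);
# the chosen filter combination here is just an example.
def _filterTokens_demo(doc):
    funclist = [
        removePOS(["PUNCT", "SPACE"]),  # drop punctuation and whitespace tokens
        remove_short_words(),           # drop tokens shorter than 2 characters
    ]
    return filterTokens([tok for tok in doc], funclist)
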
def processContentstream2(textstream, parser, token_filterlist=None):
    # pre parse
    textstream = preparse(textstream)

    pipe = parser.pipe(textstream)

    for doc in pipe:

        tokens = [tok for tok in doc]

        # in parse
        if token_filterlist is not None:
            tokens = filterTokens(tokens, token_filterlist)

        # post parse
        tokens = [postparse(tok) for tok in tokens]  # todo: information gets lost here!

        yield " ".join(tokens)


def preparse(stringstream):

    for string in stringstream:
        # cut_after
        # todo: remove addresses --> postal.parser; idea: add them to the metadata instead
        words = ["gruss", "grusse", "gruesse", "gruessen", "grusses"]

        for gr in words:
            if gr in string:
                string = string.rpartition(gr)[0]
                break

        yield string


def postparse(toktext):
    """
    :param toktext: spacy.token
    :return: string
    """
    toktext = toktext.lower_

    # remove_words_containing_topLVL
    toktext = toktext if not re.search(REGEX_TOPLVL, toktext) else ""

    # lemmatize
    toktext = lemmatizeWord(toktext)

    # normalize synonyms
    toktext = getFirstSynonym(toktext)

    # autocorrect
    toktext = autocorrectWord(toktext)

    return toktext

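# Hedged usage sketch for preparse(): not called anywhere in this module and the
# sample ticket text is made up; it only illustrates the greeting cut-off.
def _preparse_demo():
    # everything from the first matching greeting variant onwards is dropped
    return list(preparse(["bitte um hilfe mit vielen gruessen max"]))  # -> ["bitte um hilfe mit vielen "]
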
def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text


def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

@deprecated
def processContentstream(textstream, parser, token_filterlist=None):
    """
    :param textstream: string-gen
    :param token_filterlist: [func]
    :param parser: spacy-parser
    :return: string-gen
    """

    # pre_parse
    textstream = stringcleaning(textstream)

    pipe = parser.pipe(textstream)

    tokens = []
    for doc in pipe:

        tokens = [tok for tok in doc]

        # in_parse
        if token_filterlist is not None:
            tokens = filterTokens(tokens, token_filterlist)

        yield " ".join([tok.lower_ for tok in tokens])
        # yield " ".join(list(set([tok.lower_ for tok in tokens])))

def processDictstream(dictstream, funcdict, parser):
    """
    :param dictstream: dict-gen
    :param funcdict:
            clean_in_meta = {
                "Solution": funclist,
                ...
            }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():

            if key in funcdict:

                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]

                tokens = filterTokens(tokens, funclist)

                result[key] = " ".join([tok.lower_ for tok in tokens])

            else:
                result[key] = value
        yield result

##################################################################################################


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"


path2thesaurus_dict = FILEPATH + config.get("thesaurus", "pickle_file")

path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")

path2lemmadict = FILEPATH + config.get("lemmatization", "pickle_file")

path2nouns_list = FILEPATH + config.get("nouns", "pickle_file")

path2firstnameslist = FILEPATH + config.get("firstnames", "pickle_file")

path2DEstopwordlist = FILEPATH + config.get("de_stopwords", "pickle_file")

path2ENstopwordlist = FILEPATH + config.get("en_stopwords", "pickle_file")

corpus_de_path = FILEPATH + config.get("de_corpus", "path")

corpus_en_path = FILEPATH + config.get("en_corpus", "path")

def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
2017-10-16 14:01:38 +02:00
2017-10-25 09:46:44 +02:00
logprint("Preprocess {0}_corpus at {1}".format(lang, datetime.now()))
2017-10-16 14:01:38 +02:00
2017-10-25 09:46:44 +02:00
cleanCorpus_name = lang + "_clean_ticket"
2017-10-17 10:13:49 +02:00
preCorpus_name = lang + "_pre_ticket"
2017-10-16 14:01:38 +02:00
2017-10-25 09:46:44 +02:00
logprint("Load {0}_raw".format(lang))
2017-10-17 10:13:49 +02:00
#load raw corpus and create new one
2017-10-25 09:46:44 +02:00
clean_corpus, parser = load_corpus(corpus_name=cleanCorpus_name, corpus_path=corpus_path)
2017-10-16 14:01:38 +02:00
2017-10-17 10:13:49 +02:00
corpus = textacy.Corpus(parser)
2017-10-16 14:01:38 +02:00
2017-10-17 10:13:49 +02:00
## process and add files to textacy-corpi,
corpus.add_texts(
2017-10-25 09:46:44 +02:00
processContentstream2(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser)
2017-10-17 10:13:49 +02:00
)
2017-10-16 14:01:38 +02:00
2017-10-17 10:13:49 +02:00
# leere docs aus corpi kicken
corpus.remove(lambda doc: len(doc) == 0)
2017-10-16 14:01:38 +02:00
2017-10-17 10:13:49 +02:00
for i in range(printrandom):
printRandomDoc(corpus)
2017-10-16 14:01:38 +02:00
2017-10-17 10:13:49 +02:00
#save corpus
save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)
2017-10-16 14:01:38 +02:00
2017-10-25 09:46:44 +02:00
#save corpus as labled, plain text
plainpath = FILEPATH + config.get("de_corpus", "path") + "labled_lines.txt"
textacy.fileio.write_file_lines(labledCorpiLines(corpus),filepath=plainpath )
return corpus
def labledCorpiLines(corpus):
for doc in corpus:
# generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
yield "[" + doc.metadata["categoryName"] + "] " + doc.text
2017-10-16 14:01:38 +02:00
2017-10-17 10:13:49 +02:00
def main():
    start = time.time()

    # make the loaded dictionaries visible to the module-level helpers
    # (lemmatizeWord, getFirstSynonym, the spellchecker) instead of shadowing
    # them with local variables
    global THESAURUS, WORDS, LEMMAS, NOUNS, VORNAMEN, DE_STOP_WORDS, EN_STOP_WORDS

    THESAURUS = load_obj(path2thesaurus_dict)
    WORDS = load_obj(path2wordsdict)
    LEMMAS = load_obj(path2lemmadict)

    DE_STOP_WORDS = load_obj(path2DEstopwordlist)
    EN_STOP_WORDS = load_obj(path2ENstopwordlist)

    NOUNS = load_obj(path2nouns_list)
    VORNAMEN = load_obj(path2firstnameslist)

    filter_tokens = [
        # removeENT(["PERSON"]),
        keepNouns(NOUNS),

        remove_words_containing_Numbers(),

        removePOS(["PUNCT", "SPACE", "NUM"]),

        # removeWords(de_stop_words + custom_words),
        removeWords(DE_STOP_WORDS),

        remove_long_words(),
        remove_short_words(),
        remove_first_names()
    ]

    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],
        "Subject": [removePOS(["SPACE", "PUNCT"])],
        "categoryName": [removePOS(["SPACE", "PUNCT"])]
    }

    corpus = preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de", printrandom=5)

    # from topicModeling import jgibbsLLDA
    # jgibbsLLDA(corpus)

    # preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en")

    end = time.time()
    logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()

"""
pipe=[
2017-10-10 14:42:09 +02:00
##String
2017-10-10 14:42:09 +02:00
fixUnicode(),
replaceHardS(),
resolveAbbrivations(),
2017-10-10 14:42:09 +02:00
remove_words_containing_topLVL(),
2017-10-10 14:42:09 +02:00
replaceSpecialChars(" "), (mit Leerzeichen erstzen, dadruch werden Terme wie 8203;verfügung getrennt
2017-10-10 14:42:09 +02:00
remove_words_containing_Numbers(),
2017-08-31 14:54:01 +02:00
2017-10-10 14:42:09 +02:00
##spacyParse
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
removeENT("PERSON"),
keepPOS(["NOUN"]),
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
#ODER
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
lemmatize(),
removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
# evtl.
spellCorrection(),
keepUniqeTokens(),
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
]
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
"""

"""
filter_tokens=[
    # removeENT(["PERSON"]),
    # idea: remove addresses; so far done via cut_after("gruss") --> postal.parser
    # idea: spelling correction --> PyEnchant
    # idea: thesaurus --> WordNet, or a custom one

    remove_words_containing_Numbers(),

    removePOS(["PUNCT", "SPACE", "NUM"]),

    removeWords(de_stop_words + custom_words),

    remove_long_words(),
    remove_short_words(),
    remove_first_names(),

    keepPOS(["NOUN"]),

]
"""