# -*- coding: utf-8 -*-
|
|
|
|
import os
|
|
import time
|
|
from datetime import datetime
|
|
import textacy
|
|
from scipy import *
|
|
from miscellaneous import *
|
|
|
|
|
|
|
|
# Allow arbitrarily large CSV fields (ticket bodies can exceed csv's default limit).
# NOTE(review): `csv` and `sys` are presumably provided by the star imports above — confirm.
csv.field_size_limit(sys.maxsize)

# Absolute directory of this script, with a trailing slash.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"

# NOTE(review): `ConfigParser` looks like the configparser module re-exported by a
# star import (read_file is the Python-3 API) — confirm against `miscellaneous`.
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
############# stringcleaning
|
|
|
|
|
|
def clean(stringstream):
    r"""
    Clean a stream of strings.

    Per string:
      * fix bad unicode
      * separate words on the characters `\=~%^&*()_+[]{};'"|</>
      * normalize whitespace
      * remove linebreaks
      * replaceRockDots (normalize umlaut-like characters)

    :param stringstream: generator of str
    :return: generator of str
    """
    # Hoisted out of the loop: the separator pattern is loop-invariant.
    # TODO(review): for headers/footers maybe add English-language detection,
    # address parsing and salutation detection (carried over from original note).
    separator_pattern = re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]')

    for string in stringstream:
        # fix bad unicode
        string = textacy.preprocess.fix_bad_unicode(string)

        # separate words on special characters
        string = " ".join(separator_pattern.split(string))

        # normalize whitespace
        string = textacy.preprocess.normalize_whitespace(string)

        # remove linebreaks
        # TODO(review): paragraphs may carry meaning — reconsider dropping them.
        string = re.sub(r'[\n]', " ", string)

        # normalize umlaut-like characters (logically part of normalization)
        string = replaceRockDots(string)

        yield string
|
|
|
|
|
|
|
|
def processDictstream_v2(dictstream, keys_to_clean):
    """
    Normalize selected metadata values in a stream of dicts.

    For every key in *keys_to_clean* the value is run through
    ``normalize_str``, lower-cased, and stripped of ``.`` ``!`` ``?``;
    all other entries are passed through unchanged.

    :param dictstream: generator of dict
    :param keys_to_clean: container of keys whose values get cleaned
    :return: generator of dict
    """
    punct_pattern = re.compile(r'[.!?]')  # loop-invariant: compile once

    for dic in dictstream:
        result = {
            k: punct_pattern.sub("", normalize_str(v).lower()) if k in keys_to_clean else v
            for k, v in dic.items()
        }
        yield result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def processDictstream(dictstream, funcdict, parser):
    """
    Clean selected metadata fields of a dict stream with spaCy.

    :param dictstream: dict-gen
    :param funcdict:
        clean_in_meta = {
            "Solution": funclist,
            ...
        }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for entry in dictstream:
        cleaned = {}
        for field, raw in entry.items():
            if field not in funcdict:
                # field not configured for cleaning — pass through unchanged
                cleaned[field] = raw
                continue

            # Tokenize the value and run it through the field's filter chain.
            toks = filterTokens(list(parser(raw)), funcdict[field])
            cleaned[field] = " ".join([t.lower_ for t in toks])
        yield cleaned
|
|
|
|
def filterTokens(tokens, funclist):
    """
    Apply a chain of predicate functions to a token list.

    A token survives only if every predicate in *funclist* returns a
    truthy value for it.

    :param tokens: list of tokens
    :param funclist: list of predicates ``tok -> bool``
    :return: filtered list of tokens
    """
    # NOTE(review): a leftover debug loop (``if tok.pos_ == "NOUN": x = 0``,
    # with x unused) was removed — it had no effect beyond touching tok.pos_.
    for predicate in funclist:
        tokens = list(filter(predicate, tokens))

    return tokens
|
|
|
|
|
|
##################################################################################################
|
|
|
|
|
|
# Directory (relative to FILEPATH) where the German corpus is stored/saved.
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
|
|
|
|
|
|
def cleanCorpus(corpus, clean_in_meta):
    """
    Clean a textacy corpus and save the result.

    Every document text is normalized via :func:`clean`, the metadata
    fields listed in *clean_in_meta* are normalized via
    :func:`processDictstream_v2`, empty documents are dropped, and the
    cleaned corpus is written to ``corpus_de_path``.

    :param corpus: textacy corpus exposing ``lang`` and ``spacy_lang``
    :param clean_in_meta: keys whose metadata values get cleaned
    :return: the cleaned textacy corpus
    """
    # NOTE(review): a dead triple-quoted block (NOUNS pickle loading /
    # noun-case-fixing experiments) was removed here; it was never executed.
    logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    # load Corpus
    raw_corpus = corpus
    parser = corpus.spacy_lang

    # Actually clean the corpus
    cleaned_corpus = textacy.Corpus(parser)
    cleaned_corpus.add_texts(
        clean(corpus2Text(raw_corpus)),
        processDictstream_v2(corpus2Meta(raw_corpus), clean_in_meta)
    )

    # kick empty docs out of the corpus
    cleaned_corpus.remove(lambda doc: len(doc) == 0)

    # save corpus
    cleanCorpus_name = corpus.lang + "_clean"
    save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)

    return cleaned_corpus
|
|
|
|
|
|
def removePOS(pos_list):
    """Build a token predicate that rejects tokens whose POS tag is in *pos_list*."""
    def keep(tok):
        return tok.pos_ not in pos_list
    return keep
|
|
|
|
|
|
|
|
def main(corpus):
    """
    Run the cleaning stage on *corpus*, log the elapsed time, and return
    the cleaned corpus.

    :param corpus: textacy corpus to clean
    :return: cleaned textacy corpus
    """
    start = time.time()

    # Metadata fields whose values are normalized by processDictstream_v2.
    # NOTE(review): an earlier funcdict config (removePOS filters per field)
    # was dead code — it was immediately overwritten by this list — and has
    # been removed.
    clean_in_meta = ["Subject", "categoryName"]

    cleaned_corpus = cleanCorpus(corpus, clean_in_meta)

    end = time.time()
    logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))
    return cleaned_corpus
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Standalone run: load the raw German corpus from disk and clean it.
    # NOTE(review): hard-coded absolute path — consider reading it from config.ini.
    corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",
                                 corpus_name="de_raw")
    main(corpus)
|
|
|
|
|
|
|
|
|