topicModelingTickets/cleaning.py

# -*- coding: utf-8 -*-
import os
import time
from datetime import datetime

import textacy
from scipy import *
from miscellaneous import *  # presumably also provides csv, sys, re, ConfigParser and the corpus helpers used below


csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


############# string cleaning
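
# The config.ini keys this module reads, roughly (the values are only
# illustrative; the real file lives next to this script):
#
#   [de_corpus]
#   path = corpi/
#
#   [nouns]
#   pickle_file = nouns_list.pkl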

def clean(stringstream):  #, NOUNS):
    """
    Clean a stream of strings:
      fix bad unicode
      separate words on the regex `\=~%^&*()_+\[\]{};\'"|</>
      normalize whitespace
      remove line breaks
      replaceRockDots
    :param stringstream: generator of str
    :return: generator of str
    """

    #NOUNS = [n.lower() for n in NOUNS]

    for string in stringstream:

        # fix unicode
        string = textacy.preprocess.fix_bad_unicode(string)
        #string = textacy.preprocess.unidecode(string)

        # separate words on special characters
        # TODO: maybe keep these? For headers and footers, English language
        #       detection, address parsing and salutation detection might help.
        string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]').split(string))

        # normalize whitespace
        string = textacy.preprocess.normalize_whitespace(string)
        # TODO: consider textacy.preprocess.remove_accents(text, method=u'unicode')

        # remove line breaks
        # TODO: can/should this go? Paragraphs might carry meaning.
        string = re.sub(r'[\n]', " ", string)

        string = replaceRockDots(string)  # TODO: this belongs to normalization

        """
        # Fixing capitalization errors via the noun list does not work well,
        # since words inside a sentence get changed too.

        #for n in nouns:
        #    string = string.replace(n.lower(), n)
        #string = multisub(nouns_tuples, string)

        #https://stackoverflow.com/questions/10968558/python-re-sub-with-a-list-of-words-to-find
        #string = re.sub(r'[\n]', " ", string)
        #string = string.replace(noun, noun.title()) for noun in nouns

        splitted = string.split()
        for i, s in enumerate(splitted):

            if s in NOUNS:
                splitted[i] = s.title()
            if i != 0:
                for punct in ":.!?":
                    if punct in splitted[i - 1]:
                        splitted[i] = s.title()

        string = " ".join(splitted)
        """

        yield string
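
# A minimal usage sketch for clean() (the input strings are made up; the exact
# output depends on replaceRockDots() from miscellaneous). Not run on import:
def _demo_clean():
    texts = ["Hallo=Welt{kaputtes   Encoding}", "zu   viele    Leerzeichen\nund Umbrüche"]
    # clean() is itself a generator, so it chains lazily over a whole corpus
    return list(clean(iter(texts)))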

def processDictstream_v2(dictstream, keys_to_clean):
    """
    Lowercase the values of the given metadata keys, strip sentence
    punctuation from them, and pass all other keys through unchanged.

    :param dictstream: dict-gen
    :param keys_to_clean: list of keys whose values get normalized
    :return: dict-gen
    """
    for dic in dictstream:
        result = {k: re.sub(r'[.!?]', "", normalize_str(v).lower()) if k in keys_to_clean else v
                  for k, v in dic.items()}
        yield result
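
# Sketch of processDictstream_v2() on ticket metadata (the field names are
# only illustrative; normalize_str() comes from miscellaneous):
def _demo_processDictstream_v2():
    metas = [{"Subject": "Drucker kaputt!", "TicketNumber": "INC-123"}]
    # only "Subject" is lowercased and stripped; "TicketNumber" passes through
    return list(processDictstream_v2(iter(metas), keys_to_clean=["Subject"]))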

def processDictstream(dictstream, funcdict, parser):
    """
    Re-parse selected metadata fields with spaCy and keep only the tokens
    that pass all filter functions registered for that key.

    :param dictstream: dict-gen
    :param funcdict:
        clean_in_meta = {
            "Solution": funclist,
            ...
        }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]
                tokens = filterTokens(tokens, funclist)
                result[key] = " ".join([tok.lower_ for tok in tokens])
            else:
                result[key] = value
        yield result
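
# Sketch of a funcdict for processDictstream() (uses removePOS() defined
# below; the metadata keys mirror the ones used in main()):
def _demo_processDictstream(parser):
    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],
        "Subject": [removePOS(["SPACE", "PUNCT"])],
    }
    metas = [{"Subject": "Drucker druckt nicht !", "Keep": "unberührt"}]
    return list(processDictstream(iter(metas), clean_in_meta, parser=parser))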

def filterTokens(tokens, funclist):
    # in: token list, list of predicate functions
    # out: tokens that pass every predicate
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens

##################################################################################################
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
def cleanCorpus(corpus, clean_in_meta):

    logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    """
    ressources_path = FILEPATH + "ressources/"
    path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
    #NOUNS = load_obj(path2nouns_list)
    #noun_disjunction = '|'.join(NOUNS)
    #nouns_tuples = []
    #for n in NOUNS:
    #    nouns_tuples.append((n.lower(), n))
    """

    # load corpus
    raw_corpus = corpus
    parser = corpus.spacy_lang

    # actually clean the corpus
    cleaned_corpus = textacy.Corpus(parser)
    cleaned_corpus.add_texts(
        clean(corpus2Text(raw_corpus)),
        #processDictstream(corpus2Meta(raw_corpus), clean_in_meta, parser=parser)
        processDictstream_v2(corpus2Meta(raw_corpus), clean_in_meta)
    )

    # kick empty docs out of the corpus
    cleaned_corpus.remove(lambda doc: len(doc) == 0)

    # save corpus
    cleanCorpus_name = corpus.lang + "_clean"
    save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)

    return cleaned_corpus


def removePOS(pos_list):
    # build a token predicate that drops tokens with the given POS tags
    return lambda tok: tok.pos_ not in pos_list

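
# Quick sketch of removePOS() combined with filterTokens() (assumes a loaded
# spaCy model; pos_list uses the universal POS tags spaCy assigns):
def _demo_removePOS(parser):
    doc = parser("Hallo , Welt !")
    keep = filterTokens([tok for tok in doc], [removePOS(["PUNCT", "SPACE"])])
    return [tok.text for tok in keep]  # punctuation tokens are filtered out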

def main(corpus):

    start = time.time()

    # the funcdict variant was used with the old processDictstream():
    #clean_in_meta = {
    #    "Solution": [removePOS(["SPACE"])],
    #    "Subject": [removePOS(["SPACE", "PUNCT"])],
    #    "categoryName": [removePOS(["SPACE", "PUNCT"])]
    #}
    # processDictstream_v2() only needs the key names:
    clean_in_meta = ["Subject", "categoryName"]

    cleaned_corpus = cleanCorpus(corpus, clean_in_meta)

    end = time.time()
    logprint("Time Elapsed Cleaning: {0} min".format((end - start) / 60))
    return cleaned_corpus


if __name__ == "__main__":
    corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",
                                 corpus_name="de_raw")
    main(corpus)