# -*- coding: utf-8 -*-

import os
import re
import sys
import csv
import time
from datetime import datetime

import ConfigParser

import textacy
from scipy import *
from miscellaneous import *

csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


############# string cleaning

def clean(stringstream):  # , NOUNS):
    """
    Cleans a stream of strings:
      * fix bad unicode
      * separate words on the regex characters `\=~%^&*()_+\[\]{};\'"|
      * normalize whitespace
      * remove linebreaks
      * replaceRockDots

    :param stringstream: generator of str
    :return: generator of str
    """
    # NOUNS = [n.lower() for n in NOUNS]

    for string in stringstream:

        # fix unicode
        string = textacy.preprocess.fix_bad_unicode(string)
        # string = textacy.preprocess.unidecode(string)

        # separate words on regex characters
        string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|]').split(string))
        # TODO: maybe leave this as is? For headers and footers, consider English-language
        # detection, address parsing and salutation detection instead.

        # normalize whitespace
        string = textacy.preprocess.normalize_whitespace(string)
        # TODO: consider textacy.preprocess.remove_accents(text, method=u'unicode')

        # remove linebreaks
        string = re.sub(r'[\n]', " ", string)
        # TODO: can/should this go? Paragraph breaks may carry meaning.

        string = replaceRockDots(string)  # TODO: belongs with the normalization step

        """
        # Fixing capitalization errors via the noun list does not work well,
        # since words inside a sentence get changed as well.
        #for n in nouns:
        #    string = string.replace(n.lower(), n)

        #string = multisub(nouns_tuples, string)
        # https://stackoverflow.com/questions/10968558/python-re-sub-with-a-list-of-words-to-find
        #string = re.sub(r'[\n]', " ", string)
        #string = string.replace(noun, noun.title()) for noun in nouns

        splitted = string.split()
        for i, s in enumerate(splitted):
            if s in NOUNS:
                splitted[i] = s.title()
            if i != 0:
                for punct in ":.!?":
                    if punct in splitted[i - 1]:
                        splitted[i] = s.title()
        string = " ".join(splitted)
        """

        yield string


def processDictstream_v2(dictstream, keys_to_clean):
    """
    Lower-cases the values of the given metadata keys and strips sentence punctuation.

    :param dictstream: generator of dict
    :param keys_to_clean: list of metadata keys to clean
    :return: generator of dict
    """
    for dic in dictstream:
        result = {k: re.sub(r'[.!?]', "", normalize_str(v).lower()) if k in keys_to_clean else v
                  for k, v in dic.items()}
        yield result
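# ---------------------------------------------------------------------------
# Illustrative usage sketch for processDictstream_v2() (not part of the
# pipeline). The sample ticket metadata below is made up, and the exact output
# depends on normalize_str() from miscellaneous, assumed here to only
# normalize whitespace before the lower-casing and punctuation stripping:
#
#   meta_stream = iter([{"Subject": "Drucker defekt!",
#                        "categoryName": "Hardware.",
#                        "TicketNumber": "INC0042"}])
#   cleaned = next(processDictstream_v2(meta_stream, ["Subject", "categoryName"]))
#   # cleaned -> {"Subject": "drucker defekt",
#   #             "categoryName": "hardware",
#   #             "TicketNumber": "INC0042"}
# ---------------------------------------------------------------------------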
def processDictstream(dictstream, funcdict, parser):
    """
    Token-filters selected metadata fields with spaCy.

    :param dictstream: dict-gen
    :param funcdict: e.g.
            clean_in_meta = {
                "Solution": funclist,
                ...
            }
    :param parser: spacy parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]

                tokens = filterTokens(tokens, funclist)

                result[key] = " ".join([tok.lower_ for tok in tokens])
            else:
                result[key] = value
        yield result


def filterTokens(tokens, funclist):
    # in: token list, function list
    # out: token list
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens


##################################################################################################

corpus_de_path = FILEPATH + config.get("de_corpus", "path")


def cleanCorpus(corpus, clean_in_meta):

    logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    """
    ressources_path = FILEPATH + "ressources/"
    path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
    #NOUNS = load_obj(path2nouns_list)
    #noun_disjunction = '|'.join(NOUNS)
    #nouns_tuples = []
    #for n in NOUNS:
    #    nouns_tuples.append((n.lower(), n))
    """

    # load corpus
    raw_corpus = corpus
    parser = corpus.spacy_lang

    # actually clean the corpus
    cleaned_corpus = textacy.Corpus(parser)
    cleaned_corpus.add_texts(
        clean(corpus2Text(raw_corpus)),
        # processDictstream(corpus2Meta(cleaned_corpus), clean_in_meta, parser=parser)
        processDictstream_v2(corpus2Meta(raw_corpus), clean_in_meta)
    )

    # kick empty docs out of the corpus
    cleaned_corpus.remove(lambda doc: len(doc) == 0)

    # save corpus
    cleanCorpus_name = corpus.lang + "_clean"
    save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)

    return cleaned_corpus


def removePOS(pos_list):
    return lambda tok: tok.pos_ not in pos_list


def main(corpus):
    start = time.time()

    # clean_in_meta for the older processDictstream(): metadata key -> token-filter functions
    # clean_in_meta = {
    #     "Solution": [removePOS(["SPACE"])],
    #     "Subject": [removePOS(["SPACE", "PUNCT"])],
    #     "categoryName": [removePOS(["SPACE", "PUNCT"])]
    # }

    # clean_in_meta for processDictstream_v2(): a plain list of metadata keys
    clean_in_meta = ["Subject", "categoryName"]

    cleaned_corpus = cleanCorpus(corpus, clean_in_meta)

    end = time.time()
    logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))

    return cleaned_corpus


if __name__ == "__main__":
    corpus, parser = load_corpus(
        corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",
        corpus_name="de_raw")
    main(corpus)
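# ---------------------------------------------------------------------------
# Sketch of the config.ini keys this script reads (section and option names
# are taken from the code above; the values shown are placeholders):
#
#   [de_corpus]
#   path = corpi/
#
#   [nouns]
#   # only required if the disabled noun-list capitalization code in
#   # clean() / cleanCorpus() is re-enabled
#   pickle_file = nouns.pkl
# ---------------------------------------------------------------------------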