# -*- coding: utf-8 -*-

import os
import sys
import csv
import re
import time
from datetime import datetime

import textacy
from scipy import *

# assumption: ConfigParser is Python 3's configparser module (it may also be
# re-exported by the miscellaneous star import below)
import configparser as ConfigParser

# project helpers used below (logprint, replaceRockDots, save_corpus,
# load_corpus, corpus2Text, corpus2Meta, ...)
from miscellaneous import *


# allow very large fields when reading csv files
csv.field_size_limit(sys.maxsize)


FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"


# load config
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
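
# The config.ini loaded above is assumed to contain at least the sections and
# keys read in this module; a minimal sketch with placeholder values:
#
#   [de_corpus]
#   path = corpi/
#
#   [nouns]
#   pickle_file = nouns.pkl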


############# string cleaning

def clean(stringstream):  # , NOUNS):
    """
    Clean a stream of strings:

    - fix bad unicode
    - separate words on the regex `\=~%^&*()_+\[\]{};\'"|</>
    - normalize whitespace
    - remove linebreaks
    - replaceRockDots

    :param stringstream: generator of str
    :return: generator of str
    """

    # NOUNS = [n.lower() for n in NOUNS]

    for string in stringstream:
        # fix bad unicode
        string = textacy.preprocess.fix_bad_unicode(string)
        # string = textacy.preprocess.unidecode(string)

        # separate words on the regex
        string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]').split(string))

        # normalize whitespace
        string = textacy.preprocess.normalize_whitespace(string)

        # remove linebreaks
        string = re.sub(r'[\n]', " ", string)

        # replace "rock dots" (umlauts etc.); helper from miscellaneous
        string = replaceRockDots(string)

        """
        # Correcting capitalization via the noun list does not really work,
        # because words inside the sentence get changed as well.

        # for n in nouns:
        #     string = string.replace(n.lower(), n)
        # string = multisub(nouns_tuples, string)

        # https://stackoverflow.com/questions/10968558/python-re-sub-with-a-list-of-words-to-find
        # string = re.sub(r'[\n]', " ", string)
        # string = string.replace(noun, noun.title()) for noun in nouns

        splitted = string.split()
        for i, s in enumerate(splitted):

            if s in NOUNS:
                splitted[i] = s.title()

            if i != 0:
                for punct in ":.!?":
                    if punct in splitted[i - 1]:
                        splitted[i] = s.title()

        string = " ".join(splitted)
        """

        yield string
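

# Minimal usage sketch for clean(); the strings below are illustrative, not
# taken from the project's data:
#
#   texts = ["Ein  Beispiel=Text\nmit Umbruch", "noch   ein  Text"]
#   for cleaned in clean(iter(texts)):
#       print(cleaned)
#
# Each yielded string has had bad unicode fixed, the listed special characters
# split off, whitespace normalized, linebreaks removed and replaceRockDots applied.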


##################################################################################################


corpus_de_path = FILEPATH + config.get("de_corpus", "path")


def cleanCorpus(corpus):
    logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    """
    ressources_path = FILEPATH + "ressources/"
    path2nouns_list = ressources_path + config.get("nouns", "pickle_file")

    # NOUNS = load_obj(path2nouns_list)
    # noun_disjunction = '|'.join(NOUNS)
    # nouns_tuples = []
    # for n in NOUNS:
    #     nouns_tuples.append((n.lower(), n))
    """

    # load corpus
    raw_corpus = corpus
    parser = corpus.spacy_lang

    # actually clean the corpus: rebuild it from the cleaned texts,
    # keeping each document's metadata
    cleaned_corpus = textacy.Corpus(parser)
    cleaned_corpus.add_texts(
        clean(corpus2Text(raw_corpus)),
        corpus2Meta(raw_corpus)
    )
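
    # corpus2Text / corpus2Meta are helpers from miscellaneous; they are assumed
    # to be generators roughly along the lines of:
    #
    #   def corpus2Text(corpus):
    #       for doc in corpus:
    #           yield doc.text
    #
    #   def corpus2Meta(corpus):
    #       for doc in corpus:
    #           yield doc.metadata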

    # drop empty docs from the corpus
    cleaned_corpus.remove(lambda doc: len(doc) == 0)

    # save corpus
    cleanCorpus_name = corpus.lang + "_clean"
    save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)

    return cleaned_corpus


def main(corpus):
    start = time.time()

    cleaned_corpus = cleanCorpus(corpus)

    end = time.time()
    logprint("Time Elapsed Cleaning: {0} min".format((end - start) / 60))
    return cleaned_corpus


if __name__ == "__main__":
    corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",
                                 corpus_name="de_raw")
    main(corpus)