topicModelingTickets/cleaning.py

174 lines
3.6 KiB
Python
Raw Normal View History

2017-10-25 09:46:44 +02:00
# -*- coding: utf-8 -*-
2017-12-08 11:06:07 +01:00
import os
2017-10-25 09:46:44 +02:00
import time
2017-12-08 11:06:07 +01:00
from datetime import datetime
2017-10-25 09:46:44 +02:00
import textacy
from scipy import *
2017-12-08 11:06:07 +01:00
from miscellaneous import *
2017-10-25 09:46:44 +02:00
2017-11-06 12:54:59 +01:00
2017-10-25 09:46:44 +02:00
# Raise the csv module's per-field size cap to the platform maximum.
csv.field_size_limit(sys.maxsize)

# Directory that contains this module, kept with a trailing slash so that
# relative names can simply be appended to it.
_module_dir = os.path.dirname(os.path.realpath(__file__))
FILEPATH = _module_dir + "/"

# Load the project configuration that lives next to this module.
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
############# stringcleaning
2017-12-08 11:06:07 +01:00
def clean(stringstream):
    r"""Lazily clean every string produced by ``stringstream``.

    Each incoming string goes through these steps, in order:

    1. bad unicode is repaired (``textacy.preprocess.fix_bad_unicode``),
    2. the string is split on any of the characters
       ` = ~ % ^ & * ( ) _ + [ ] { } ; ' " | < / >
       and the pieces are re-joined with single spaces,
    3. whitespace is normalized (``textacy.preprocess.normalize_whitespace``),
    4. every remaining line break is replaced by a space,
    5. the result is passed through ``replaceRockDots``
       (NOTE(review): presumably transliterates umlauts -- confirm against
       the helper's definition).

    :param stringstream: iterable yielding strings
    :return: generator yielding the cleaned strings
    """
    # NOTE: restoring noun capitalization from a noun list was tried here and
    # abandoned, because it also altered words in the middle of sentences.
    separator_pattern = re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]')

    for raw_text in stringstream:
        repaired = textacy.preprocess.fix_bad_unicode(raw_text)

        # Treat the separator characters as word boundaries.
        separated = " ".join(separator_pattern.split(repaired))

        normalized = textacy.preprocess.normalize_whitespace(separated)

        # Flatten the text onto a single line.
        single_line = re.sub(r'[\n]', " ", normalized)

        yield replaceRockDots(single_line)
2017-10-25 09:46:44 +02:00
##################################################################################################
# Location handed to save_corpus() for the cleaned corpus: the "path" option of
# the [de_corpus] config section, appended to this module's directory.
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
2017-12-08 11:06:07 +01:00
def cleanCorpus(corpus):
    """Build, persist and return a cleaned copy of ``corpus``.

    The texts of ``corpus`` are streamed through ``clean`` and added, together
    with their metadata, to a fresh ``textacy.Corpus`` that uses the same spaCy
    language object. Documents that end up empty are removed, and the result
    is saved as ``<lang>_clean`` under ``corpus_de_path``.

    :param corpus: textacy corpus to clean; it is read, not modified here
    :return: the newly created, cleaned textacy corpus
    """
    logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    def is_empty(doc):
        # A document whose cleaned text produced no tokens.
        return len(doc) == 0

    # Fresh corpus sharing the source corpus' spaCy pipeline.
    cleaned_corpus = textacy.Corpus(corpus.spacy_lang)

    cleaned_texts = clean(corpus2Text(corpus))
    metadata = corpus2Meta(corpus)
    cleaned_corpus.add_texts(cleaned_texts, metadata)

    # Drop documents that are empty after cleaning.
    cleaned_corpus.remove(is_empty)

    # NOTE(review): the target directory always comes from the "de_corpus"
    # config section, even though the name is derived from corpus.lang --
    # confirm this is intended for non-German corpora.
    cleanCorpus_name = corpus.lang + "_clean"
    save_corpus(
        corpus=cleaned_corpus,
        corpus_path=corpus_de_path,
        corpus_name=cleanCorpus_name,
    )

    return cleaned_corpus
2017-10-25 09:46:44 +02:00
2017-12-08 11:06:07 +01:00
def main(corpus):
    """Clean ``corpus`` and log how long the cleaning took.

    :param corpus: textacy corpus forwarded to ``cleanCorpus``
    :return: the cleaned corpus produced by ``cleanCorpus``
    """
    started_at = time.time()
    result = cleanCorpus(corpus)
    finished_at = time.time()

    # Wall-clock duration, reported in minutes.
    elapsed_minutes = (finished_at - started_at) / 60
    logprint("Time Elapsed Cleaning:{0} min".format(elapsed_minutes))

    return result
2017-10-25 09:46:44 +02:00
if __name__ == "__main__":
    # Script mode: load the raw German corpus from the developer's local
    # checkout and clean it.
    # NOTE(review): the absolute path is machine-specific.
    raw_corpus_dir = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
    raw_corpus_name = "de_raw"

    # load_corpus returns a pair; only the corpus itself is needed here.
    corpus, _ = load_corpus(corpus_path=raw_corpus_dir, corpus_name=raw_corpus_name)
    main(corpus)
2017-10-25 09:46:44 +02:00