added regex for further cleaning

jannis.grundmann 2017-09-15 14:32:44 +02:00
parent 13ec7cdef4
commit 092052dfe1
9 changed files with 358642 additions and 67 deletions
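The "regex for further cleaning" refers to two new token filters in test.py below (remove_words_containing_Numbers, remove_words_containing_specialCharacters). A minimal standalone sketch of the same idea on plain strings rather than spaCy tokens (everything except the two regexes is illustrative):

import re

def contains_number(word):
    # True if the token contains any digit, e.g. "nr54065467"
    return bool(re.search(r'\d', word))

def contains_special_character(word):
    # True if the token contains punctuation or other special characters, e.g. "------problem--------"
    return bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', word))

tokens = ["drucker", "nr54065467", "------problem--------", "verfügung"]
print([t for t in tokens if not contains_number(t) and not contains_special_character(t)])
# ['drucker', 'verfügung']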

config.ini
View File

@@ -1,7 +1,19 @@
-[default]
-thesauruspath = openthesaurus.csv
-path2xml = ticketSamples.xml
+[filepath]
+thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv
+path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml
+path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv
+small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
+logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
+lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt
language = de
@@ -9,7 +21,7 @@ language = de
ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
-custom_words = grüßen,fragen
+custom_words = grüßen,fragen,damen,probleme,herren,dank
#lemmatize = True
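The section rename from [default] to [filepath] means every config.get("default", ...) call has to be updated as well (test.py below does this). A minimal sketch of reading the new section, assuming the config.ini shown above:

import configparser
import logging

config = configparser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)

# the new keys live in [filepath] now
lemmas = config.get("filepath", "lemmas")
logging.basicConfig(filename=config.get("filepath", "logfile"), level=logging.INFO)
logging.info("lemma list: %s" % lemmas)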

lemmatization-de.txt (new file, 358473 added lines)
File diff suppressed because it is too large

View File

@@ -22,7 +22,7 @@ config = ConfigParser.ConfigParser()
with open("config.ini") as f:
config.read_file(f)
#todo print&log
path2xml = config.get("default","path2xml")
@@ -281,7 +281,7 @@ topicModel = 'lda'
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
top_topic_words = 5
-top_document_labels_per_topic = 2
+top_document_labels_per_topic = 5
n_topics = len(set(corpus[0].metadata.keys()))+1 #+1 because of a default topic
@@ -311,7 +311,7 @@ id2term = vectorizer.__getattribute__("id_to_term")
"""
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
# Initialize and train a topic model
@@ -339,7 +339,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_docu
print()
print()
"""

test.py (208 changed lines)
View File

@@ -1,7 +1,11 @@
# -*- coding: utf-8 -*-
import time
start = time.time()
+import logging
import csv
import functools
@@ -16,45 +20,55 @@ import textacy
from scipy import *
from textacy import Vectorizer
import warnings
-csv.field_size_limit(sys.maxsize)
+import configparser as ConfigParser
import sys
-old_stdout = sys.stdout
+csv.field_size_limit(sys.maxsize)
-log_file = open("printout.log","w")
-sys.stdout = log_file
# Load the configuration file
-import configparser as ConfigParser
+config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
-with open("config.ini") as f:
+with open(config_ini) as f:
config.read_file(f)
+# config logging
+logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
-path2csv = "M42-Export/Tickets_small.csv"
-#path2csv = "M42-Export/Tickets_2017-09-13.csv" # 21167
-path2xml = config.get("default","path2xml")
-thesauruspath = config.get("default","thesauruspath")
-DE_PARSER = spacy.load("de")
-de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+thesauruspath = config.get("filepath","thesauruspath")
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
+DE_PARSER = spacy.load("de") #todo language detection idea: separate corpora for different languages
+de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+LEMMAS=config.get("filepath","lemmas")
############# misc
+def printlog(string, level="INFO"):
+    """log and prints"""
+    print(string)
+    if level=="INFO":
+        logging.info(string)
+    elif level=="DEBUG":
+        logging.debug(string)
+    elif level == "WARNING":
+        logging.warning(string)
+printlog("Load functions")
def compose(*functions):
def compose2(f, g):
return lambda x: f(g(x))
@@ -84,17 +98,17 @@ def get_calling_function():
return func
raise AttributeError("func not found")
def printRandomDoc(textacyCorpus):
import random
print()
-print("len(textacyCorpus) = %i" % len(textacyCorpus))
+printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
-print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
+printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
print()
############# load xml
def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
"""
@@ -111,6 +125,7 @@ def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
for field in ticket:
if field.tag == main_textfield:
yield field.text
def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
@@ -144,6 +159,7 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):
content_collumn = j
else:
yield lst[content_collumn]
def csv_to_metaStream(path2csv: str, metalist: [str]):
"""
:param path2csv: string
@@ -171,9 +187,8 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):
+############################################ Preprocessing ##############################################
-print("############################################ Preprocessing ##############################################")
-print()
############# on str-gen
@@ -211,7 +226,11 @@ def processTextstream(textstream, funclist, parser=DE_PARSER):
pipe = parser.pipe(textstream)
for doc in pipe:
-tokens = [tok for tok in doc]
+tokens = []
+for tok in doc:
+    tokens.append(tok)
tokens = processTokens(tokens,funclist,parser)
yield " ".join([tok.lower_ for tok in tokens])
@@ -264,7 +283,6 @@ def removePOS(pos_list)-> bool:
return ret
def removeWords(words, keep=None)-> bool:
-#todo in: str or str-list
if hasattr(keep, '__iter__'):
for k in keep:
try:
@@ -289,13 +307,37 @@ def removeENT(ent_list) -> bool:
ret.__annotations__ = get_calling_function().__annotations__
return ret
-def lemmatize() -> str:
+def remove_words_containing_Numbers() -> bool:
-ret = lambda tok: tok.lemma_
+ret = lambda tok: not bool(re.search('\d', tok.lower_))
ret.__annotations__ = get_calling_function().__annotations__
return ret
+def remove_words_containing_specialCharacters() -> bool:
+    ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+def lemmatizeWord(word,filepath=LEMMAS):
+    """http://www.lexiconista.com/datasets/lemmatization/"""
+    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
+        if word.lower() == line.split()[1].strip().lower():
+            return line.split()[0].strip().lower()
+    return word.lower()  # if nothing was found
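lemmatizeWord re-reads the whole lemma list for every token. A sketch of the same lookup loaded once into a dict, assuming the whitespace-separated "lemma inflected-form" line format that the indexing above implies (load_lemma_dict is illustrative, not part of this commit):

def load_lemma_dict(filepath):
    # maps inflected form -> lemma, both lowercased
    lemma_of = {}
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                lemma_of[parts[1].lower()] = parts[0].lower()
    return lemma_of

LEMMA_OF = load_lemma_dict("lemmatization-de.txt")

def lemmatize_word(word):
    # falls back to the lowercased word itself, like lemmatizeWord above
    return LEMMA_OF.get(word.lower(), word.lower())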
+def lemmatize() -> str:
+    ret = lambda tok: lemmatizeWord(tok.lower_)
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
def normalizeSynonyms(default_return_first_Syn=False) -> str:
ret = lambda tok : getFirstSynonym(tok.lower_, default_return_first_Syn=default_return_first_Syn)
@@ -377,6 +419,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
return ret
+def fixUnicode() -> str:
+    ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
def resolveAbbreviations():
pass #todo
@@ -396,10 +447,44 @@ def lower() -> spacy.tokens.Doc:
return ret
+################################################################################################################
+path2xml = config.get("filepath","path2xml")
+path2csv = config.get("filepath","path2csv")
+path2csv = "M42-Export/Tickets_med.csv"
+printlog("CSV: {0}".format(path2csv))
ticketcorpus = textacy.Corpus(DE_PARSER)
+#idea: change ß to ss? in general?
+"""
+maybe split categories into subcategories
+general:
+fix unicode, split words at special characters
+remove names
+remove e-mails, urls, numbers
+maybe even remove everything that contains any of those (or ends in a .topleveldomain, contains special characters, or contains an @)
+separate meaningful words from garbage: 8203;verfügung
+remove abbreviations: m.a, o.ä.
+meaningless examples: nr54065467 455a33c5 tvt?= ------problem--------
+"""
metaliste = [
"Subject",
@@ -409,45 +494,55 @@ metaliste = [
clean_in_meta = {
"Solution":[removePOS(["SPACE"]),lower()],
-"Subject":[removePOS(["SPACE","PUNCT"]),lower()]
+"Subject":[removePOS(["SPACE","PUNCT"]),lower()],
+"categoryName": [removePOS(["SPACE", "PUNCT"]), lower()]
}
+printlog("Start Preprocessing")
clean_in_content=[
+#removePOS(["SPACE","PUNCT","NUM"]),
+keepPOS(["NOUN"]),
removePOS(["SPACE","PUNCT","NUM"]),
+remove_words_containing_Numbers(),
+remove_words_containing_specialCharacters(),
#replaceURLs(),
#replaceEmails(),
-lemmatize(),
+#fixUnicode(),
+#removeWords(de_stop_words),
-keepUniqeTokens(),
+#lemmatize(),
-removePOS(["PUNCT"]),
+#removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),
+#removeENT("PERSON"),
+#keepPOS(["NOUN"]),
+#keepUniqeTokens(),
+#keepENT(config.get("preprocessing","ents2keep"))
]
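Each factory above copies its return-type annotation onto the lambda it returns (ret.__annotations__ = get_calling_function().__annotations__), which suggests the pipeline dispatches on that annotation: bool-returning entries act as token filters, str-returning ones as token replacements. processTokens itself is not part of this diff, so the following is only a guess at that mechanism:

def apply_funclist(tokens, funclist):
    # guessed dispatch: 'bool' entries filter tokens, 'str' entries map them
    for func in funclist:
        returns = func.__annotations__.get("return")
        if returns is bool:
            tokens = [tok for tok in tokens if func(tok)]
        elif returns is str:
            tokens = [func(tok) for tok in tokens]
    return tokens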
## add files to textacy-corpus,
-print("add texts to textacy-corpus...")
+printlog("add texts to textacy-corpus")
ticketcorpus.add_texts(
processTextstream(csv_to_contentStream(path2csv,"Description"), clean_in_content),
processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
)
-printRandomDoc(ticketcorpus)
+for i in range(10):
+    printRandomDoc(ticketcorpus)
end = time.time()
-print("\n\nTime Elapsed Preprocessing:{0}\n\n".format(end - start))
+printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
-print("############################################ Topic Modeling #############################################")
+############################################ Topic Modeling #############################################
print("\n\n")
start = time.time()
@@ -482,7 +577,7 @@ def generate_labled_lines(textacyCorpus):
####################'####################' todo: everything into the config
-ngrams = (1)
+ngrams = 1
min_df = 0
max_df = 1.0
@@ -508,7 +603,7 @@ n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen
-print("\nvectorize corpus...")
+printlog("vectorize corpus...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)
@@ -528,12 +623,12 @@ id2term = vectorizer.__getattribute__("id_to_term")
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
# Initialize and train a topic model
-print("\nInitialize and train a topic model..")
+printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
#Transform the corpus and interpret our model:
-print("Transform the corpus and interpret our model..")
+printlog("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()
@@ -544,15 +639,15 @@ for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=t
print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
print(topic_idx)
for j in top_docs:
print(ticketcorpus[j].metadata['categoryName'])
#####################################################################################################################
print()
print()
"""
@@ -560,19 +655,20 @@ print()
-jgibbsLLDA_root = "java_LabledLDA/"
+jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
-filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
#create file
-textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus),filepath=filepath)
+textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus), filepath=LLDA_filepath)
# wait for file to exist
-while not os.path.exists(filepath):
+while not os.path.exists(LLDA_filepath):
time.sleep(1)
-print("\nstart LLDA:\n")
+print("\n\n")
+printlog("start LLDA:")
#run JGibsslda file
FNULL = open(os.devnull, 'w') # supress output
subprocess.call(["java",
@@ -586,7 +682,7 @@ subprocess.call(["java",
# NOTE: the files are hidden. to be found in models/
-#print twords
+#twords
subprocess.call(["gzip",
"-dc",
"{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
@@ -594,7 +690,7 @@ subprocess.call(["gzip",
print()
print()
"""
@@ -602,10 +698,4 @@ print()
end = time.time()
-print("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
+printlog("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
-sys.stdout = old_stdout
-log_file.close()