This commit is contained in:
parent 873e9ff7d2
commit 66e4b972eb
@@ -0,0 +1,32 @@
Remove greeting formulas at the start of the ticket text
Whitelist (incl. kb keywords)
Keep acronyms & abbreviations
Tagging before normalization
Experiment with upper/lower case
Don't build bigrams on the normalized text
Relevance of certain words
Keep numbers
Include the ticket subject
Let ITMC determine the topics after LDA
Include the tree hierarchy of the category (improve the dataset if necessary)
Automatically add current technical terms to the whitelist
Levenshtein/Hamming distance instead of autocorrect (if the distance is smaller than x, treat it as the same word; see the sketch below this list)
TODO Wednesday: full-text indexing (term frequencies; for numbers, treat predecessor/successor as one term)
Keep the main verb (root)
Shrink the categories: ontologies/organigram
Remove footer/header
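A minimal sketch of the Levenshtein idea from the note above: two tokens count as the same word when their edit distance stays below a threshold. The function names and the max_dist threshold are illustrative assumptions, not code from this commit.

def levenshtein(a, b):
    # classic dynamic-programming edit distance between two strings
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            insert = current[j - 1] + 1
            delete = previous[j] + 1
            substitute = previous[j - 1] + (ca != cb)
            current.append(min(insert, delete, substitute))
        previous = current
    return previous[-1]

def same_word(token_a, token_b, max_dist=2):
    # treat two tokens as the same word if their edit distance stays within the threshold
    return levenshtein(token_a.lower(), token_b.lower()) <= max_dist

print(same_word("netzwerk", "netzwrek"))  # True: distance 2, within the default threshold

Unlike autocorrect, this only merges near-identical spellings and never rewrites a token into a different word.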
13  main.py

@@ -31,6 +31,19 @@ start = time.time()
# todo test models

logprint("main.py started at {}".format(datetime.now()))
@@ -16,6 +16,7 @@ import glob, os
from textacy.fileio import open_sesame
import json
from spacy.tokens.doc import Doc as SpacyDoc
import operator

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@@ -124,6 +125,10 @@ def list_from_files(*paths):
def breakpoint():
    pass

def sort_dictionary(dict):
    return sorted(dict.items(), key=operator.itemgetter(1))

def normalize(string):
    # replaceRockDots
    string = re.sub(r'[ß]', "ss", string.lower())
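A quick usage sketch of the new sort_dictionary helper (the sample counts are made up): it sorts ascending by value, so the most frequent terms end up at the end of the returned list, which is what the from_/to_ slice in test.py relies on.

counts = {"drucker": 5, "vpn": 12, "passwort": 3}
print(sort_dictionary(counts))  # [('passwort', 3), ('drucker', 5), ('vpn', 12)]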
@@ -296,6 +296,7 @@ path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")
custom_words = get_list_from_config("preprocessing", "custom_words")
172  test.py

@@ -21,18 +21,18 @@ FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
import draw

"""
# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
preCorpus_name = "de" + "_pre_ticket"
corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(corpus.lang))

#todo randomize corpus

#todo randomize

split = 0.8
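A sketch of the randomized 80/20 split the #todo hints at, assuming the corpus can be handled as a plain list of documents; the docs list below is a dummy stand-in, not repository code.

import random

docs = ["ticket {}".format(i) for i in range(10)]  # placeholder for the loaded corpus
split = 0.8

random.seed(42)              # reproducible shuffle while experimenting
random.shuffle(docs)
cut = int(split * len(docs))
corpus_train, corpus_test = docs[:cut], docs[cut:]
print(len(corpus_train), len(corpus_test))  # 8 2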
@@ -64,9 +64,10 @@ model.fit(doc_term_matrix)

compenents = model.model.components_

"""
components_ : array, [n_components, n_features]

Variational parameters for topic word distribution.
@@ -78,9 +79,78 @@ the number of times word j was assigned to topic i.
It can also be viewed as distribution over the words for each topic after normalization:
model.components_ / model.components_.sum(axis=1)[:, np.newaxis].
"""

test_doc = corpus_test[0]
bla = test_doc.to_bag_of_terms(ngrams=1, named_entities=True, normalize=u'lower', lemmatize=None, lowercase=True, weighting=u'count', as_strings=False)
key_list = bla.keys()
bla_list = list(bla)

print(bla)
print(bla_list)

for k in bla.keys():
    print(id2term[k])
"""

"""
ressources_path = FILEPATH + "ressources/"
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
DE_STOP_WORDS = load_obj(path2DEstopwordlist)

# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
rawCorpus_name = "de" + "_raw_ticket"
corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)

#parser = spacy.load("de")
#corpus = textacy.Corpus(parser)

#TODO Wednesday: full-text indexing (term frequencies; for numbers, treat predecessor/successor as one term)
"""
testtxt = "Sehr geehrtes ITMC Service Team,\r\n\r\nseit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen" \
          " an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren. " \
          "\r\nIch würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen.\r\n\r\n" \
          "Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus" \
          " unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht " \
          "aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich" \
          " dieses Problem wenden kann.\r\n\r\nBei Rückfragen stehe ich gerne zur Verfügung!\r\n\r\nBeste Grüße," \
          "\r\n\r\nNicolas Rauner\r\n\r\nLS Biomaterialien und Polymerwissenschaften\r\nFakultät Bio- und Chemieingenieurwesen\r\nTU Dortmund" \
          " \r\nD-44227 Dortmund\r\n\r\nTel: + 49-(0)231 / 755 - 3015\r\nFax: + 49-(0)231 / 755 - 2480\r\n\r\nwww.ls-bmp.de <http://www.ls-bmp.de/>"

#corpus.add_text(testtxt)
"""

term_dict_w_stop = {}
term_dict_wo_stop = {}
footings = ["gruss", "grusse", "gruesse", "gruessen", "grusses"]

for doc in corpus:

    tokens = [tok for tok in doc]

    # strip the footer: cut the token list at the first greeting word

    for i,tok in enumerate(tokens):
        text = tok.text
        text = re.sub(r'[ß]', "ss", text)
        text = re.sub(r'[ö]', "oe", text)
        text = re.sub(r'[ü]', "ue", text)
        text = re.sub(r'[ä]', "ae", text)

        for gr in footings:
            if gr in text.lower():
                tokens = tokens[0:i]
                #print(tokens)
                break
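The four umlaut substitutions above reappear in the next hunk and as normalize() in the other file; a small helper along these lines could replace the repeated blocks. The name replace_rock_dots is an assumption borrowed from the # replaceRockDots comments, not a function in the repository.

import re

def replace_rock_dots(text):
    # replace German umlauts and eszett with ASCII digraphs, as done inline in this commit
    for pattern, repl in ((r'[ß]', "ss"), (r'[ö]', "oe"), (r'[ü]', "ue"), (r'[ä]', "ae")):
        text = re.sub(pattern, repl, text)
    return text

print(replace_rock_dots("Grüße"))  # Gruesse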
@@ -88,12 +158,85 @@ test_doc = corpus_test[0]

    for i,tok in enumerate(tokens):

        if tok.is_space or tok.is_punct or tok.like_url or tok.like_email:
            continue

        if i is not 0:
            #text = tok.text if tokens[i-1].pos_ is not "NUM" else tok.text+" "+tokens[i-1].text

            if tokens[i-1].like_num:
                text = tokens[i - 1].text + " " + tok.text
            else:
                text = tok.text

        else:
            text = tok.text

        # replaceRockDots
        text = re.sub(r'[ß]', "ss", text)
        text = re.sub(r'[ö]', "oe", text)
        text = re.sub(r'[ü]', "ue", text)
        text = re.sub(r'[ä]', "ae", text)

        if text not in term_dict_w_stop.keys():
            term_dict_w_stop[text] = 1
        else:
            term_dict_w_stop[text] += 1

        if text.lower() not in DE_STOP_WORDS:
            if text not in term_dict_wo_stop.keys():
                term_dict_wo_stop[text] = 1
            else:
                term_dict_wo_stop[text] += 1


term_dict_sorted = sort_dictionary(term_dict_w_stop)
term_dict_wo_sorted = sort_dictionary(term_dict_wo_stop)

split_value = 0.2
from_ = int((1-split_value) * float(len(term_dict_sorted)))  # 1-split
to_ = len(term_dict_sorted)

#logprint(term_dict_sorted[from_: to_])
#logprint("\n")
#logprint(term_dict_wo_sorted[from_: to_])

for elem in term_dict_sorted:
    logprint(elem)

logprint("\n")
logprint("\n")
logprint("\n")
logprint("\n")

for elem in term_dict_wo_sorted:
    logprint(elem)

"""

in_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stop.txt"
out_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stopwords.txt"

gen = reversed(list(open(in_path)))

textacy.fileio.write_file_lines(gen, out_path)
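The manual dict counting above could also be written with collections.Counter; a sketch with made-up tokens, where cleaned_terms and de_stop_words stand in for the values built in the loop.

from collections import Counter

cleaned_terms = ["drucker", "vpn", "drucker", "und", "netzwerk", "vpn", "drucker"]
de_stop_words = {"und", "der", "die", "das"}

term_counts_w_stop = Counter(cleaned_terms)
term_counts_wo_stop = Counter(t for t in cleaned_terms if t not in de_stop_words)

# most_common() returns (term, count) pairs sorted by frequency, descending,
# so the separate sort_dictionary() step would not be needed.
print(term_counts_w_stop.most_common())   # [('drucker', 3), ('vpn', 2), ('und', 1), ('netzwerk', 1)]
print(term_counts_wo_stop.most_common())  # [('drucker', 3), ('vpn', 2), ('netzwerk', 1)]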
@@ -149,27 +292,6 @@ print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))

"""