jannis.grundmann 2017-11-29 16:31:30 +01:00
parent 873e9ff7d2
commit 66e4b972eb
5 changed files with 198 additions and 25 deletions

aufgaben.txt (new file, 32 lines)

@ -0,0 +1,32 @@
remove greeting phrases at the beginning
whitelist (incl. KB keywords)
keep acronyms & abbreviations
tagging before normalization
experiment with upper/lower case
do not build bigrams on the normalized text
relevance of certain words
keep numbers
include the ticket subject
have ITMC determine the topics after the LDA run
include the category tree hierarchy (improve the dataset if necessary)
automatically add current technical terms to the whitelist
Levenshtein/Hamming distance instead of autocorrect (if the distance is smaller than x, treat it as the same word; see the sketch below this list)
TODO Wednesday: full-text indexing (term frequencies; for numbers, treat predecessor/successor as one term)
keep the main verb (root)
shrink the categories: ontologies/organigram
remove footers/headers
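The Levenshtein/Hamming item is the most algorithmic of these notes; below is a minimal Python sketch of that idea. The threshold x, the helper names, and the default value are illustrative and not taken from the repository.

def levenshtein(a, b):
    # plain dynamic-programming edit distance, no external dependency
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            insert = current[j - 1] + 1
            delete = previous[j] + 1
            substitute = previous[j - 1] + (ca != cb)
            current.append(min(insert, delete, substitute))
        previous = current
    return previous[-1]

def same_word(a, b, x=2):
    # treat two tokens as the same word if their edit distance is below the threshold x
    return levenshtein(a.lower(), b.lower()) < x

A hand-rolled distance keeps the sketch dependency-free; a library implementation could replace levenshtein() later without changing same_word().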

main.py (13 changed lines)

@ -31,6 +31,19 @@ start = time.time()
# TODO: test the models
logprint("main.py started at {}".format(datetime.now()))

View File

@ -16,6 +16,7 @@ import glob, os
from textacy.fileio import open_sesame
import json
from spacy.tokens.doc import Doc as SpacyDoc
import operator
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@ -124,6 +125,10 @@ def list_from_files(*paths):
def breakpoint():
    pass
def sort_dictionary(dict):
    return sorted(dict.items(), key=operator.itemgetter(1))
def normalize(string):
    # replaceRockDots
    string = re.sub(r'[ß]', "ss", string.lower())
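For orientation, a small usage sketch of the new sort_dictionary helper (the terms and counts are made up for illustration): it returns (key, value) pairs sorted by value in ascending order, so the most frequent terms end up at the end of the list, which is what the slicing in test.py below relies on.

counts = {"drucker": 12, "vpn": 3, "passwort": 7}
print(sort_dictionary(counts))
# -> [('vpn', 3), ('passwort', 7), ('drucker', 12)]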

View File

@ -296,6 +296,7 @@ path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")
custom_words = get_list_from_config("preprocessing", "custom_words")

test.py (172 changed lines)

@ -21,18 +21,18 @@ FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
import draw
"""
# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
preCorpus_name = "de" + "_pre_ticket"
corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(corpus.lang))
#todo randomize corpus
#todo randomize
split = 0.8
@ -64,9 +64,10 @@ model.fit(doc_term_matrix)
compenents = model.model.components_
"""
components_ : array, [n_components, n_features]
Variational parameters for topic word distribution.
@ -78,9 +79,78 @@ the number of times word j was assigned to topic i.
It can also be viewed as distribution over the words for each topic after normalization:
model.components_ / model.components_.sum(axis=1)[:, np.newaxis].
"""
test_doc = corpus_test[0]
bla = test_doc.to_bag_of_terms(ngrams=1, named_entities=True, normalize=u'lower', lemmatize=None, lowercase=True, weighting=u'count', as_strings=False)
key_list = bla.keys()
bla_list = list(bla)
print(bla)
print(bla_list)
for k in bla.keys():
    print(id2term[k])
"""
"""
ressources_path = FILEPATH + "ressources/"
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
DE_STOP_WORDS = load_obj(path2DEstopwordlist)
# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
rawCorpus_name = "de" + "_raw_ticket"
corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
#parser = spacy.load("de")
#corpus = textacy.Corpus(parser)
# TODO Wednesday: full-text indexing (term frequencies; for numbers, treat predecessor/successor as one term)
"""
testtxt = "Sehr geehrtes ITMC Service Team,\r\n\r\nseit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen" \
" an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren. " \
"\r\nIch würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen.\r\n\r\n" \
"Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus" \
" unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht " \
"aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich" \
" dieses Problem wenden kann.\r\n\r\nBei Rückfragen stehe ich gerne zur Verfügung!\r\n\r\nBeste Grüße," \
"\r\n\r\nNicolas Rauner\r\n\r\nLS Biomaterialien und Polymerwissenschaften\r\nFakultät Bio- und Chemieingenieurwesen\r\nTU Dortmund" \
" \r\nD-44227 Dortmund\r\n\r\nTel: + 49-(0)231 / 755 - 3015\r\nFax: + 49-(0)231 / 755 - 2480\r\n\r\nwww.ls-bmp.de <http://www.ls-bmp.de/>"
#corpus.add_text(testtxt)
"""
term_dict_w_stop = {}
term_dict_wo_stop = {}
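# term_dict_w_stop counts every remaining token; term_dict_wo_stop counts only tokens that are not German stop words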
footings = ["gruss", "grusse", "gruesse", "gruessen", "grusses"]
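# footings: greeting/sign-off stems (after umlaut folding) used to detect the start of the e-mail footer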
for doc in corpus:
    tokens = [tok for tok in doc]

    # strip the footer: truncate the token list at the first footing match
    for i, tok in enumerate(tokens):
        text = tok.text
        text = re.sub(r'[ß]', "ss", text)
        text = re.sub(r'[ö]', "oe", text)
        text = re.sub(r'[ü]', "ue", text)
        text = re.sub(r'[ä]', "ae", text)
        for gr in footings:
            if gr in text.lower():
                tokens = tokens[0:i]
                #print(tokens)
                break
@ -88,12 +158,85 @@ test_doc = corpus_test[0]
    for i, tok in enumerate(tokens):
        if tok.is_space or tok.is_punct or tok.like_url or tok.like_email:
            continue
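        # if the preceding token is a number, join it with the current token into one term (cf. the full-text indexing TODO)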
        if i != 0:
            #text = tok.text if tokens[i-1].pos_ is not "NUM" else tok.text+" "+tokens[i-1].text
            if tokens[i-1].like_num:
                text = tokens[i - 1].text + " " + tok.text
            else:
                text = tok.text
        else:
            text = tok.text

        # replaceRockDots
        text = re.sub(r'[ß]', "ss", text)
        text = re.sub(r'[ö]', "oe", text)
        text = re.sub(r'[ü]', "ue", text)
        text = re.sub(r'[ä]', "ae", text)

        if text not in term_dict_w_stop.keys():
            term_dict_w_stop[text] = 1
        else:
            term_dict_w_stop[text] += 1

        if text.lower() not in DE_STOP_WORDS:
            if text not in term_dict_wo_stop.keys():
                term_dict_wo_stop[text] = 1
            else:
                term_dict_wo_stop[text] += 1
term_dict_sorted = sort_dictionary(term_dict_w_stop)
term_dict_wo_sorted = sort_dictionary(term_dict_wo_stop)
split_value = 0.2
from_ = int((1 - split_value) * float(len(term_dict_sorted)))  # start index of the top split_value share of the frequency-sorted terms
to_ = len(term_dict_sorted)
#logprint(term_dict_sorted[from_: to_])
#logprint("\n")
#logprint(term_dict_wo_sorted[from_: to_])
for elem in term_dict_sorted:
    logprint(elem)
logprint("\n")
logprint("\n")
logprint("\n")
logprint("\n")
for elem in term_dict_wo_sorted:
    logprint(elem)
"""
in_path= "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stop.txt"
out_path= "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stopwords.txt"
gen=reversed(list(open(in_path)))
textacy.fileio.write_file_lines(gen,out_path)
@ -149,27 +292,6 @@ print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))
""" """