preprocessing reworked

This commit is contained in:
jannis.grundmann 2017-12-08 11:06:07 +01:00
parent 66e4b972eb
commit db7ea1a72a
18 changed files with 849 additions and 317 deletions

View File

@ -79,3 +79,29 @@ WD2, R. 112
Dezernat 2 Hochschulentwicklung
Abteilung 2.3 Organisationsentwicklung
E-Mail: jan.hustadt@tu-dortmund.de";"Herr Alexev Swetlomier (HIWI) küümert sich bereits um das Laptop und Frau Herbst weiß auch Bescheid die zur Zeit im Urlaub ist"
"INC40484";"Defekte Netzwerkdose / Frage zu VPN";"13.08.2015 14:25:50";"LAN";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Sehr geehrtes ITMC Service Team,
seit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren.
Ich würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen.
Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich dieses Problem wenden kann.
Bei Rückfragen stehe ich gerne zur Verfügung!
Beste Grüße,
Nicolas Rauner
LS Biomaterialien und Polymerwissenschaften
Fakultät Bio- und Chemieingenieurwesen
TU Dortmund
D-44227 Dortmund
Tel: + 49-(0)231 / 755 - 3015
Fax: + 49-(0)231 / 755 - 2480
www.ls-bmp.de <http://www.ls-bmp.de/>";"Hallo Herr Rauner,
die Netzwerkdose weist z. Z. keine Verbindungsprobleme auf. Falls doch welche bestehen, melden Sie sich bitte bei uns.
Mit freunldichen Grüßen
Aicha Oikrim"

TicketNumber Subject CreatedDate categoryName Impact Urgency BenutzerID VerantwortlicherID EigentuemerID Description Solution

View File

@ -1,32 +1,45 @@
remove greeting phrases at the beginning
whitelist (incl. kb keywords)
keep acronyms & abbreviations
tagging before normalization
bigrams not on pre, but on cleaned
experiment with upper/lower case
bigrams not on normalized text
relevance of certain words
keep numbers
include the ticket subject
have ITMC determine the topics after LDA
include the category tree hierarchy (improve the dataset if necessary)
automatically add current technical terms to the whitelist
keep numbers; bigrams: NUM-word combinations
Levenshtein/Hamming distance instead of autocorrect (if smaller than x, treat it as the same word); see the sketch after these notes
TODO Wednesday: full-text indexing (term frequencies; for numbers, treat predecessor/successor as one term)
include the ticket subject
# licenses into the whitelist
relevance of certain words ???
have ITMC determine the topics after LDA
include the category tree hierarchy (improve the dataset if necessary)
automatically add current technical terms to the whitelist
shrink categories: ontologies/organigram
### Done:
tagging before normalization
experiment with upper/lower case: # critical, changes the POS tagging. according to the term list this is taken into account; idea: fix wrong capitalization using a list or similar --> does not work, the list also contains words that are not always nouns
remove greeting phrases at the beginning
whitelist (incl. kb keywords)
keep the main verb (root)
shrink categories: ontologies/organigram
example: "gesperrt" - keep adjectives and verbs?
remove footer/header
remove footer/header
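The Levenshtein idea from the notes above could look roughly like the following; a minimal sketch under assumed names (levenshtein, match_known_word, vocabulary, threshold x), not code from the repository.

def levenshtein(a, b):
    # classic dynamic-programming edit distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                # deletion
                           cur[j - 1] + 1,             # insertion
                           prev[j - 1] + (ca != cb)))  # substitution
        prev = cur
    return prev[-1]

def match_known_word(token, vocabulary, x=2):
    # if the closest known word is at most x edits away, treat it as the same word
    best = min(vocabulary, key=lambda w: levenshtein(token, w))
    return best if levenshtein(token, best) <= x else token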

View File

@ -1,24 +1,18 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import csv
import sys
from miscellaneous import *
from datetime import datetime
import os
import time
from datetime import datetime
import textacy
from scipy import *
from miscellaneous import *
import os
from preprocessing import removePOS
from preprocessing import filterTokens
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config
config_ini = FILEPATH + "config.ini"
@ -27,78 +21,25 @@ with open(config_ini) as f:
config.read_file(f)
REGEX_SPECIALCHAR = r'[`\=~%^&*()_+\[\]{};\'"|</>]' #+r',.-\\:' #+r',.?!'
WORDS= {}
########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download
import re
def words(text): return re.findall(r'\w+', text.lower())
def P(word, N=sum(WORDS.values())):
"Probability of `word`."
return WORDS[word] / N
def correction(word):
"Most probable spelling correction for word."
return max(candidates(word), key=P)
def candidates(word):
"Generate possible spelling corrections for word."
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
def known(words):
"The subset of `words` that appear in the dictionary of WORDS."
return set(w for w in words if w in WORDS)
def edits1(word):
"All edits that are one edit away from `word`."
letters = 'abcdefghijklmnopqrstuvwxyz'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
def edits2(word):
"All edits that are two edits away from `word`."
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
def autocorrectWord(word):
try:
return correction(word)
except:
return word
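For orientation, a small sanity check of the Norvig-style corrector above (the sample word is made up; in the pipeline, WORDS is filled from the spellchecking pickle via load_obj(path2wordsdict)):

print("netzwerk" in edits1("netzwrek"))   # True: one transposition away
# autocorrectWord() is intended to return the most frequent known candidate
# and falls back to the input word whenever correction() raises an error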
############# stringcleaning
def clean(stringstream):#, NOUNS):
def clean(stringstream,autocorrect=False):
#NOUNS = [n.lower() for n in NOUNS]
for string in stringstream:
# fixUnicode
string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
string = textacy.preprocess.fix_bad_unicode(string)
#string = textacy.preprocess.unidecode(string)
# seperate_words_on_regex:
string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]').split(string))
#normalize whitespace
string = textacy.preprocess.normalize_whitespace(string)
@ -106,123 +47,112 @@ def clean(stringstream,autocorrect=False):
#remove linebreaks
string = re.sub(r'[\n]', " ", string)
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
string = replaceRockDots(string)
"""
# bug: fixing capitalization via the noun list does not really work, because words inside the sentence get changed as well.
#for n in nouns:
# string = string.replace(n.lower(),n)
#string = multisub(nouns_tuples,string)
#https://stackoverflow.com/questions/10968558/python-re-sub-with-a-list-of-words-to-find
#string = re.sub(r'[\n]', " ", string)
#string = string.replace(noun,noun.title()) for noun in nouns
splitted = string.split()
for i,s in enumerate(splitted):
if s in NOUNS:
splitted[i] = s.title()
if i != 0:
for punct in ":.!?":
if punct in splitted[i - 1]:
splitted[i] = s.title()
string = " ".join(splitted)
"""
# question: autocorrect? idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
if autocorrect:
string = " ".join([autocorrectWord(word) for word in string.split()])
yield string
def processDictstream(dictstream, funcdict, parser):
"""
:param dictstream: dict-gen
:param funcdict:
clean_in_meta = {
"Solution":funclist,
...
}
:param parser: spacy-parser
:return: dict-gen
"""
for dic in dictstream:
result = {}
for key, value in dic.items():
if key in funcdict:
doc = parser(value)
tokens = [tok for tok in doc]
funclist = funcdict[key]
tokens = filterTokens(tokens, funclist)
result[key] = " ".join([tok.lower_ for tok in tokens])
else:
result[key] = value
yield result
##################################################################################################
ressources_path = FILEPATH + "ressources/"
path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
autocorrect = config.getboolean("preprocessing", "autocorrect")
def cleanCorpus(corpus):
logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))
def cleanCorpus(corpus_path, clean_in_meta, lang="de", printrandom=10,autocorrect=False):
autocorrect = False #todo STELLSCHRAUBE
ressources_path = FILEPATH + "ressources/"
path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
#NOUNS = load_obj(path2nouns_list)
#noun_disjunction = '|'.join(NOUNS)
#nouns_tuples = []
#for n in NOUNS:
# nouns_tuples.append((n.lower(),n))
logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))
rawCorpus_name = lang + "_raw_ticket"
cleanCorpus_name = lang + "_clean_ticket"
#load raw corpus and create new one
raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path)
clean_corpus = textacy.Corpus(parser)
cleanCorpus_name = corpus.lang + "_clean"
## process and add files to textacy-corpi,
clean_corpus.add_texts(
clean(corpus2Text(raw_corpus),autocorrect=autocorrect),
processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
raw_corpus = corpus
parser = corpus.spacy_lang
# Actually clean the corpus
cleaned_corpus = textacy.Corpus(parser)
cleaned_corpus.add_texts(
clean(corpus2Text(raw_corpus)),
corpus2Meta(raw_corpus)
)
# kick empty docs out of the corpus
clean_corpus.remove(lambda doc: len(doc) == 0)
cleaned_corpus.remove(lambda doc: len(doc) == 0)
for i in range(printrandom):
printRandomDoc(clean_corpus)
#save corpus
save_corpus(corpus=clean_corpus, corpus_path=corpus_path, corpus_name=cleanCorpus_name)
save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)
return clean_corpus
return cleaned_corpus
def main():
def main(corpus):
start = time.time()
WORDS = load_obj(path2wordsdict)
clean_in_meta = {
"Solution": [removePOS(["SPACE"])],
"Subject": [removePOS(["SPACE", "PUNCT"])],
"categoryName": [removePOS(["SPACE", "PUNCT"])]
}
corpus = cleanCorpus(corpus_de_path, clean_in_meta, "de",printrandom=5, autocorrect=autocorrect )
cleaned_corpus = cleanCorpus(corpus)
end = time.time()
logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))
return cleaned_corpus
if __name__ == "__main__":
main()
corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",
corpus_name="de_raw")
main(corpus)

View File

@ -14,6 +14,7 @@ pickle_file=lemma_dict.pkl
[nouns]
input=de_nouns.txt
input1=nomen.txt
input2=nomen2.txt
pickle_file=nouns_list.pkl
@ -41,6 +42,7 @@ filename=log/topicModelTickets.log
[de_corpus]
#input=M42-Export/Tickets_mini.csv
#input=M42-Export/Tickets_small.csv
input=M42-Export/de_tickets.csv

View File

@ -23,10 +23,11 @@ with open(config_ini) as f:
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
def ticketcsv_to_textStream(path2csv, content_collumn_name):
"""
:param path2csv: string
:param content_collumn_name: string
:return: string-generator
"""
stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8')
@ -42,28 +43,32 @@ def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
yield lst[content_collumn]
def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
def ticket_csv_to_DictStream(path2csv,content_collumn_name):
"""
:param path2csv: string
:param metalist: list of strings
:param content_collumn_name: string
:return: dict-generator
"""
stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8')
content_collumn = 0 # default value
metaindices = []
metadata_temp = {}
metalist = []
metadata_template = {}
for i, lst in enumerate(stream):
if i == 0:
for j, col in enumerate(lst): # could surely be done more efficiently... doesn't matter, this happens only once
for key in metalist:
if re.sub('[^a-zA-Z]+', '', key) == re.sub('[^a-zA-Z]+', '', col):
metaindices.append(j)
metadata_temp = dict(
for j, col in enumerate(lst):
if "icketNumb" in col:
col = "TicketNumber"
metalist.append(str(col))
metaindices.append(j)
metadata_template = dict(
zip(metalist, metaindices)) # e.g. {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}
else:
metadata = metadata_temp.copy()
metadata = metadata_template.copy()
for key, value in metadata.items():
metadata[key] = lst[value]
yield metadata
@ -75,19 +80,16 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = get_list_from_config("tickets","metaliste")
path2de_csv = FILEPATH + config.get("de_corpus","input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
path2en_csv = FILEPATH + config.get("en_corpus","input")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):
# print paths
@ -102,36 +104,37 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
## add files to textacy-corpi,
raw_corpus.add_texts(
ticketcsv_to_textStream(path2_csv, content_collumn_name),
ticket_csv_to_DictStream(path2_csv, metaliste)
ticket_csv_to_DictStream(path2_csv,content_collumn_name)
)
# kick empty docs out of the corpus
raw_corpus.remove(lambda doc: len(doc) == 0)
logprint("corpus-lenght: {}".format(len(raw_corpus)))
# print random docs
for i in range(printrandom):
printRandomDoc(raw_corpus)
logprint("corpus-length: {}".format(len(raw_corpus)))
# save corpus
raw_name = lang + "_raw_ticket"
raw_name = lang + "_raw"
save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)
logprint("Done")
return raw_corpus
def main():
start = time.time()
ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,metaliste,lang="de")
#ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en")
raw_corpus = ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,lang="de")
end = time.time()
logprint("Time Elapsed Corporization:{0} min".format((end - start) / 60))
return raw_corpus
if __name__ == "__main__":

17
init.py
View File

@ -81,7 +81,8 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
if elem.tag == "LexicalEntry":
lex_dictlist = [subentry.attrib for subentry in elem]
# idea: technical thesaurus
# idea: the main synonym must be a single word
synlist = []
string = "WORD"
@ -187,8 +188,8 @@ def create_stopword_lists(*paths):
de_stop_words4 = list_from_files(*de_filepaths)
#combine everything
de_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
de_stop_words = list(set(map(replaceRockDots_lambda(), list(map(textacy.preprocess.normalize_whitespace,
de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
@ -210,8 +211,8 @@ def create_stopword_lists(*paths):
# combine everything
en_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
en_stop_words1 + en_stop_words2 + en_stop_words3 + en_stop_words4)))))
en_stop_words = list(set(map(replaceRockDots_lambda(), list(map(textacy.preprocess.normalize_whitespace,
en_stop_words1 + en_stop_words2 + en_stop_words3 + en_stop_words4)))))
return de_stop_words, en_stop_words
@ -252,6 +253,7 @@ path2lemma_file = ressources_path + config.get("lemmatization","input")
path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
# NOMEN
nouns0 = ressources_path + config.get("nouns","input")
nouns1 = ressources_path + config.get("nouns","input1")
nouns2 = ressources_path + config.get("nouns","input2")
path2nouns_list = ressources_path + config.get("nouns","pickle_file")
@ -276,7 +278,7 @@ def main():
logprint("Init: {0}".format(datetime.now()))
""""""
logprint("create and save lemma_dict")
lemma_dict = create_lemma_dict(path2lemma_file)
save_obj(lemma_dict, path2lemmadict)
@ -303,7 +305,8 @@ def main():
logprint("Build and save nomenliste")
nouns = list_from_files(nouns1,nouns2)
#nouns = list_from_files(nouns1,nouns2)
nouns = list_from_files(nouns0)
save_obj(nouns, path2nouns_list)

63
main.py
View File

@ -34,34 +34,53 @@ start = time.time()
logprint("main.py started at {}".format(datetime.now()))
#init.main()
logprint("")
raw_corpus = corporization.main()
logprint("")
cleaned_corpus = cleaning.main(raw_corpus)
logprint("")
pre_corpus = preprocessing.main(cleaned_corpus)
logprint("")
"""
init.main()
logprint("")
ticket_number = "INC40484"
raw=""
pre=""
clean=""
for r in raw_corpus.get(lambda doc: doc.metadata["TicketNumber"] == ticket_number):
raw = r
for c in cleaned_corpus.get(lambda doc: doc.metadata["TicketNumber"] == ticket_number):
clean = c
for p in pre_corpus.get(lambda doc: doc.metadata["TicketNumber"] == ticket_number):
pre = p
corporization.main()
logprint("")
for tok1,tok2,tok3 in zip(raw,clean,pre):
cleaning.main()
logprint("")
preprocessing.main()
logprint("")
logprint(tok1.text,tok1.pos_)
logprint(tok2.text,tok2.pos_)
logprint(tok3.text,tok3.pos_)
"""
#for i in range(5):
# printRandomDoc(cleaned_corpus)
"""
#topicModeling.main(algorithm="lsa")
@ -71,12 +90,12 @@ logprint("")
#topicModeling.main(algorithm="nmf")
logprint("")
#topicModeling.main(algorithm="llda")
"""
topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="llda")
logprint("")
topicModeling.main(algorithm="lda")
topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="lda")
logprint("")

View File

@ -97,10 +97,26 @@ def load_obj(path):
return pickle.load(f)
def replaceRockDots():
return lambda string: re.sub(r'[ß]', "ss",
(re.sub(r'[ö]', "oe",
(re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
def replaceRockDots_lambda():
return lambda string : re.sub(r'[ß]', "ss",
(re.sub(r'[ö]', "oe",
(re.sub(r'[Ö]', "Oe",
(re.sub(r'[ü]', "ue",
(re.sub(r'[Ü]', "Ue",
(re.sub(r'[ä]', "ae",
(re.sub(r'[Ä]', "Ae",
string)))))))))))))
def replaceRockDots(string):
return re.sub(r'[ß]', "ss",
(re.sub(r'[ö]', "oe",
(re.sub(r'[Ö]', "Oe",
(re.sub(r'[ü]', "ue",
(re.sub(r'[Ü]', "Ue",
(re.sub(r'[ä]', "ae",
(re.sub(r'[Ä]', "Ae",
string)))))))))))))
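A quick illustration of the new replaceRockDots(string), which, unlike the old lambda variant above, no longer lowercases its input:

print(replaceRockDots("Größe: Übertragung äußerst langsam"))
# -> "Groesse: Uebertragung aeusserst langsam"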
def list_from_files(*paths):
"""
@ -201,6 +217,7 @@ def save_corpus(corpus, corpus_path, corpus_name):
:param corpus_path: str
:param corpus_name: str (should contain the language, like "_de_")
"""
# todo: save POS and NER tagging
# save parser
parser = corpus.spacy_lang
@ -219,7 +236,13 @@ def gen_dicts(corpus):
dict.update(doc.metadata)
yield dict
def multisub(subs, subject):
#https://stackoverflow.com/questions/764360/a-list-of-string-replacements-in-python
"Simultaneously perform all substitutions on the subject string."
pattern = '|'.join('(%s)' % re.escape(p) for p, s in subs)
substs = [s for p, s in subs]
replace = lambda m: substs[m.lastindex - 1]
return re.sub(pattern, replace, subject)
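A minimal usage sketch of multisub with made-up substitution pairs; it shows why the substitutions have to happen in one pass instead of sequential str.replace calls:

pairs = [("strasse", "str."), ("str.", "strasse")]
print(multisub(pairs, "hauptstrasse 5, nebenstr. 7"))
# -> "hauptstr. 5, nebenstrasse 7" (sequential replacements would undo each other)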
def load_corpus(corpus_path, corpus_name, lang="de"):
"""

View File

@ -18,6 +18,7 @@ FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config
config_ini = FILEPATH + "config.ini"
ressources_path = FILEPATH + "ressources/"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
@ -29,6 +30,13 @@ with open(config_ini) as f:
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]' #+r',.'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
global THESAURUS
global WORDS
global LEMMAS
global NOUNS
global VORNAMEN
global DE_STOP_WORDS
global EN_STOP_WORDS
THESAURUS = {}
WORDS= {}
@ -126,17 +134,26 @@ def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
print(word)
return word
def getFirstSynonym(word, thesaurus=THESAURUS):
def getFirstSynonym(word, thesaurus=THESAURUS,n=3):
for i in range(n):
try:
word = thesaurus[word.lower()] if word.lower() in thesaurus.keys() else word.lower()
except:
print(word)
return word
"""
if not isinstance(word, str):
return str(word)
word = word.lower()
if word in thesaurus.keys():
return thesaurus[word]
else:
return str(word)
"""
########################## Spellchecking ##########################################
@ -286,12 +303,13 @@ def processDictstream(dictstream, funcdict, parser):
##################################################################################################
ressources_path = FILEPATH + "ressources/"
path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")
path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
path2nouns_list = ressources_path + config.get("nouns","pickle_file")
path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
@ -309,60 +327,136 @@ de_plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.tx
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def extract_from_corpus(corpus):
def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
WHITELIST = ["boss", "sap", "firefox"] # todo: automatically generate relevant technical terms
logprint("Preprocess {0}_corpus at {1}".format(lang, datetime.now()))
kb_cats = ['eldorado', 'cws_confluence', 'wsus', 'mail groupware', 'd.3 dms', 'serviceportal', 'softwarelizenzen', 'sophos', 'webserver', 'sap', 'ftp server', 'dhcp', 'tonerboerse', 'mailalias', 'arbeitsplatzsupport', 'mediendienste', 'mailverteiler', 'uni mail', 'basis app', 'videoschnitt', 'DEFAULT', 'verwaltung', 'matrix42_hilfe', 'hoersaaluebertragung', 'redmine', 'uniflow', 'keine rueckantwort', 'pools', 'leitung', 'netze', 'konteneinsicht', 'kennwort aenderung', 'datanet', 'neuanschluss', 'semesterticket', 'asknet', 'veranstaltungen', 'housing', 'fk 16', 'fiona', 'betrieb', 'vorlagenerstellung', 'studierendensekretariat', 'pvp', 'mobilfunkvertraege', 'ausleihe', 'web', 'spam phishing', 'sap urlaub', 'evaexam', 'vorlesungsaufzeichnung', 'firewall betreuung', 'ub basis it', 'virtuelle desktops citrix', 'fk15', 'virtuelle server', 'lizenzserver', 'elektronisches telefonbuch', 'joomla itmc website', 'weiterentwicklung', 'serversupport', 'wlan', 'kurse', 'technik', 'raumkalender', 'backup tsm', 'haustechnik', 'voicemail box', 'facility', 'unicard ausgabe', 'mdm mobile device management', 'entwicklung', 'webgestaltung', 'unicard sperrung', 'forensic', 'basis applikationen', 'overhead projektor', 'plagiatserkennung', 'uniaccount zugangsdaten', 'zentrale webserver', 'webmailer', 'fk12 webauftritt', 'plotter', 'campus management', 'ub_stoerungen', 'rundmail', 'telefon', 'raumbuchung', 'fk12 migration', 'dienstreise', 'hardware', 'it sicherheit sic', 'hochleistungsrechnen', 'unicard', 'sos', 'benutzerverwaltung_probleme', 'confluence', 'vpn', 'zhb', 'campus app', 'itmc_aufgaben', 'sicherheit', 'schulungsraum verwaltung', 'unicard produktion', 'schulung', 'video', 'dokoll support', 'sd', 'servicedesk', 'v2 campus app feedback', 'lido', 'app feedback', 'ibz raumbuchung', 'hcm stammdaten', 'itmc_stoerungen', 'boss service desk', 'exchange nutzung', 'office', 'rektorat -buero', 'bestellung', 'moodle', 'fk raumplanung 09', 'aenderung', 'neuausstattung', 'benutzerverwaltung', 'rechnerraeume', 'designentwicklung', 'fk 12', 'werkstoffe lehrstuhl bauwesen', 'server storage', 'beantragung', 'visitenkartenproduktion', 'gastaufenthalt', 'telefonkonferenzen', 'raumbuchungssysteme', 'fk14_test', 'e mail dienste', 'grafik', 'ews', 'itmc schulungsraeume', 'tsm', 'softwareverteilung', 'beamer', 'lizenzmanagement', 'fileserver einrichtung', 'redmine projektverwaltung', 'service desk itmc', 'pruefungsmanagement', 'prozess- und projektmanagement', 'formulare antraege', 'namensaenderung', 'verkauf', 'software', 'itmc medienraeume ef50', 'zugangsdaten', 'medientechnik', 'lan', 'veeam', 'unicard redaktionsteam', 'changes', 'service portal', 'limesurvey', 'dns', 'dokoll pvp', 'uhren', 'nrw ticket', 'itmc_als', 'linux bs', 'werkvertraege', 'blogs wikis foren', 'test', 'abmeldung', 'desktop & basisdienste', 'telefonzentrale', 'siport zugangskontrolle', 'antrag auf rechnungserstellung', 'verschiedene aufgaben', 'kundenserver', 'medienraeume ef50', 'videokonferenzen', 'benutzungsverwaltung', 'mailverteiler exchange', 'lsf', 'telefonabrechnung', 'werkstaette', 'uniaccount', 'outlook_einrichtung', 'itmc webauftritt', 'zertifikate server dfn', 'allgemein', 'umzug', 'service portal redaktion', 'pos', 'beschaffung', 'boss', 'hacker angriff', 'software entwicklung', 'cd dvd produktion', 'sam spider', 'viren', 'kursplanung', 'itmc pools', 'kms', 'e learning']
kb_keys = ['zugriff_onlinedienste_rueckmeldung', 'uniaccount', 'freischaltung', 'asknet', 'eduroam', 'donnerstagsmail namensaenderung', 'asiexception', 'lsf', 'kundenantwort', 'chip', 'unitymedia', 'citavi', 'fehler', 'windows beziehen', 'wlan', 'ipv6', 'freischaltung verzoegert', 'betrag', '"defekte karte"', 'risse', 'laden', 'sap portal anderer modus', 'goeke', 'informationen des itmc zum einsatz', 'transport wurde durchgefuehrt.', 'wi-fi', 'unicard_auszahlung', 'ausleihe', 'unimail', 'uni-account', 'unicard','beantragung', 'nrw-ticket', 'printservice', 'dms', 'ip6', 'transport und beschreibung zum transportauftrag !', 'wlan passwort', 'dokumentenmanagementsystem', 'webmailer', 'vpn', 'repository', 'unicard', 'projekte', 'eingeschrieben', 'unicard abholung oeffnungszeiten', 'd3', 'beantragung', 'app tu-dortmund feedback', 'semester ticket', 'redmine', 'git', 'geldkarte', 'outlook_exchange', 'spam standardmeldung phishing', 'automatische aktualisierung der selbst angelegten kontakte in outlook', '"beschaedigte unicard"', 'elektronische telefonbuch', 'boss', 'wwrite', 'DEFAULT', 'anyconnect', 'wifi']
kb_subjs =['sd_office 365 plus support', 'citavi_lizenzschluessel_nicht bekommen', 'uni card', 'sd_office 356 plus bestellung', 'sd_gastaufenthalter', 'sd_outlook kontakte automatische aktualisierung', 'benutzer zum redmine hinzufuegen', 'sd_matlab lizenzdatei pc-pools', 'sd_tu-app feedback standard', 'vpn_ipsec_stoerung', 'vpn verbindung fuer unitymedia kunden', 'ub_prod_abholung_ abholfristen_benachrichtigungen', 'einrichtung des eduroam netzwerks', 'sd_webmailer_threadanzeige und weiterleitung', 'sd_wlan passwort setzen', 'ub_prod_namenskorrektur_student', 'sd_unimail imap_pop3', 'sd_outlook_in_exchange_einbinden', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', 'sd_heirat_namensaenderung_student', 'bd_unicard_nicht_eingeschrieben', 'wlan', 'sd_telefonbuch_prof_eintragung', 'change produktiv nehmen chn00146 - transport e01k909284', 'ungueltiges ticket siehe journal', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'd.3 client installation', 'unicard_restbetrag_auszahlung', 'cm_asiexception', 'sd_origin_workaround', 'sd_vpn_aktualisierung', 'problem mit der beantragung von der unicard', 'sd_unicard fehlerhafte geldbuchung', 'sd_login tu portals english', 'sd_gmx_web.de', 'studierendenausweis', 'sd_citavi', 'sd_fk9 test', 'sd_webmailer_thread-anzeige', 'bd_unicard_geldkarte_laden', 'ub_unicard_unicard mit vollmacht abholen', 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', 'sd_asknet_mitarbeiter_softwarebestellung', 'how to setup eduroam', 'sd_citavi bestellung', 'unicard vergessen abzuholen und nicht mehr da', 'sd_unimail zu exchange', 'sd_diensthandy beschaffung', 'sd_sap konteneinsicht antrag', 'sd_unicard_defekt', 'sd_webmailer einrichtung weiterleitung', 'sd_kurs-angebote anmeldung', 'm42_dokumentationen_zu_neuen_ous', 'sd_sap_initialkennwort', 'sd_sap_freischaltung ohne passwortaenderung', 'sd_telefonbuch-eintrag_aenderung', 'sd_pruefungsamt', 'sd_phishing', 'apps_dms-passwort d.3', 'sd_goeke drucker', 'sd_sap_dienstreise', 'unicard nochmal beantragen', 'sd_outlook anmeldung gestoert', 'sd_citavi_support', 'DEFAULT', 'sd_geraeteausleihe', 'sd_account_abmelden', 'sd_uniaccount freischaltung verzoegert englisch', 'ub_beschaedigte unicard', 'sd_gleitzeitanlage_dez3_stoerung', 'transportdurchfuehung', 'sd_sap_initialkennwort_englisch', 'sd_antwort_phishingmail', 'sd_namensaenderung mitarbeiter', 're: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss', 'lsf freischaltung als mitarbeiter/in', 'ub_unicard_spaetere abholung moeglich?', 'sd_antrag funktionale mailadresse', 'sd_apple-on-campus', 'sd_office365_asknet', 'sd_sophos download', 'sd_freischaltung uniaccount verzoegert', 'ub_unicard_zusendung der karte moeglich?', 'ohne betreff', 'sd_immatrikulationsbescheinigung_portal', 'sd_studisek_buchung_semesterbeitrag', 'sd_studisek_englisch', 'probleme mit der namensaenderung/ neue unicard', 'sd_telefonbuch, neues system', 'fehlender eintrag im elektronischen telefonbuch', 'sd_boss_notenverbuchung', 'sd_laufzeit unimail account', 'sd_semesterticket', 'sd_kontakt_asknet', 'windows 10', 'sd_login_tu_portale', 'ub_geldchip-problem bei uc', 'sd_zugriff_onlinedienste_rueckmeldung', 'sd_wlan-gastkonto', 'sd_tu_app_keine internetverbindung', 'sd_uniaccount_ehemalige_passwortaenderung', 'sd_verlust/antrag unicard', 'sd_sap_konteneinsicht_ workaround', 'apps_redmine_repository', 'sd_itmc kurse anmeldebestaetigung', 'sd_mail_als_anhang', 'bd_unicard_chip_defekt', 'probleme mit unicard', 'ub_unicard_abholungszeiten', 'sd_falsche_personendaten', 'sd_uniaccount_ehemalige_studierende', 'sd_vpn anleitungen', 'sd_kurs-angebote itmc', 'sd_studisek', 'sd_login tu portale', 'sd_immatrikulationsbescheigung_druckfehler', 'ub_drucker kopierer', 'sd_vpn_temporaerer fehler ub', 'sd_spss_online_bestellung', 'sd_dreamspark', 'sd_unicard_gesperrte unicard entsperre', 'sd_boss-bescheinigung', 'bd_goeke_allgemein', 'sd_uniaccount_passwortaenderung', 'sd_namensaenderung_englisch', 'sd_email_namensaenderung', 'bd_unicard_freigabe_beantragung', 'spam ohne tu bezug', 'sd_internationaloffice', 'sd_tu-app feedback_englisch', 'cm_lsf-boss_freischaltung', 'sd-e-mail_adresse_funktional_beantragen', 'sd_vpn_webvpn', 'sd_vpn_probleme_mit_unitymedia', 'sd_plotauftrag_zv', 'sd_beantragung_unicard', 'sd_antworten_korrekt', 'ub_prod_neue unicard bei beschaedigung', 'sd_telefonantrag_aenderung_neuantrag', 'sd_wlan passwort englisch', 'sd_aktivierung uniaccount', 'sd_spam e-mail bekannt meldung', 'sd_wlan_beratung', 'ub_namensaenderung', 'sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)', 'sd_unicard_abholung', 'sd_uniaccount_dauer freischaltung', 'sd_uniaccount activation englisch', 'sd_unicard_max_laufzeit', 'sd_unicard_workaround_bestellung', 'sd_sap_firefox_esr', 'sap portal "im anderen modus geoeffnet"', 'sd_origin nur noch eine seriennummer', 'sd_login_unibib ub-it']
cleanCorpus_name = lang + "_clean_ticket"
preCorpus_name = lang + "_pre_ticket"
WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs
THESAURUS = load_obj(path2thesaurus_dict)
#WORDS = load_obj(path2wordsdict)
LEMMAS = load_obj(path2lemmadict)
DE_STOP_WORDS = load_obj(path2DEstopwordlist)
#EN_STOP_WORDS = load_obj(path2ENstopwordlist)
VORNAMEN = load_obj(path2firstnameslist)
for doc in corpus:
result = []
#if doc.metadata["TicketNumber"] == "INC40506":
# breakpoint()
for tok in doc:
if tok.lower_ =="boss" or tok.lower_ =="sap":
print(tok.lower_+": "+tok.ent_type_)
if tok.lower_ in WHITELIST:
result.append(tok.lower_)
# ignore header, urls, emails, stopwords, first names
lemmatized_word = lemmatizeWord(tok.text,lemma_dict=LEMMAS)
if lemmatized_word.lower() in ["sehr", "geehrt", "herr" ,"herrn", "herren", "dame" , "damen", "liebe","lieben", "hallo", "guten", "tag","ehre","hi"] \
or tok.like_url \
or tok.like_email \
or tok.is_stop \
or tok.is_punct \
or tok.lower_ in DE_STOP_WORDS \
or tok.lower_ in VORNAMEN:
continue
# cut after footer
if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]: # bug: for INC40506 this cuts away most of the text
break
# boss/SAP ent_type = 'ORG' or '' (occasionally LOC or PERSON)
if tok.pos_ in ["NOUN"] \
or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART"]:
#or tok.dep_ == "ROOT":
# or tok.lower_ in NOUNS \ #,"PERSON"] \
toktext = tok.lower_
toktext = lemmatized_word
"""
first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
if first_synonym is not None:
toktext = first_synonym if len(first_synonym.split()) == 1 else toktext
"""
result.append(toktext)
yield " ".join(result)
def preprocessCorpus(corpus, clean_in_meta):
logprint("Preprocess {0}_corpus at {1}".format(corpus.lang, datetime.now()))
preCorpus_name = corpus.lang + "_pre"
clean_corpus = corpus
parser = corpus.spacy_lang
pre_corpus = textacy.Corpus(parser)
#load raw corpus and create new one
logprint("Load {0}_raw".format(lang))
clean_corpus, parser = load_corpus(corpus_name=cleanCorpus_name, corpus_path=corpus_path)
corpus = textacy.Corpus(parser)
## process and add files to textacy-corpi,
corpus.add_texts(
processContentstream(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
pre_corpus.add_texts(
#processContentstream(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
extract_from_corpus(clean_corpus),
processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser)
)
# kick empty docs out of the corpus
corpus.remove(lambda doc: len(doc) == 0)
for i in range(printrandom):
printRandomDoc(corpus)
pre_corpus.remove(lambda doc: len(doc) == 0)
#save corpus
save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)
save_corpus(corpus=pre_corpus, corpus_path=corpus_de_path, corpus_name=preCorpus_name)
# save corpus as labeled plain text
savelabledCorpiLines(corpus, de_plainpath)
savelabledCorpiLines(pre_corpus, de_plainpath)
labled_lines =""
return pre_corpus
return corpus
def main():
def main(corpus):
start = time.time()
THESAURUS = load_obj(path2thesaurus_dict)
WORDS = load_obj(path2wordsdict)
LEMMAS = load_obj(path2lemmadict)
DE_STOP_WORDS = load_obj(path2DEstopwordlist)
EN_STOP_WORDS = load_obj(path2ENstopwordlist)
NOUNS = load_obj(path2nouns_list)
VORNAMEN = load_obj(path2firstnameslist)
"""
filter_tokens = [
keepNouns(NOUNS),
@ -376,8 +470,7 @@ def main():
#todo STELLSCHRAUBE remove_short_words()
]
"""
clean_in_meta = {
"Solution": [removePOS(["SPACE"])],
"Subject": [removePOS(["SPACE", "PUNCT"])],
@ -385,15 +478,30 @@ def main():
}
corpus = preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de",printrandom=5)
pre_corpus = preprocessCorpus(corpus, clean_in_meta)
#for i in range(5):
# printRandomDoc(pre_corpus)
end = time.time()
logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
return pre_corpus
if __name__ == "__main__":
main()
corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",corpus_name="de_clean")
main(corpus)

411
test.py

File diff suppressed because one or more lines are too long

View File

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('Agg')
from datetime import datetime
import draw
import draw1
@ -20,6 +21,9 @@ from scipy import *
import os
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@ -109,8 +113,24 @@ def textacyTopicModeling(corpus,
logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
def jgibbsLLDA(labeldict,line_gen,path2save_results, top_topic_words=7):
#labeldict = {k : labelist.count(k) for k in labelist}
#max=0
#for v in labeldict.values():
# max = v if v > max else max
#labelist = sort_dictionary(labeldict)
#labeldict.update({'DEFAULT' : max+1})
labeldict_rev = {v: k for k, v in labeldict.items()}
jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
@ -246,12 +266,30 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
category = normalize(doc.metadata["categoryName"])
labelist.append(category)
# question: use only the x most frequent labels and drop the rest?
labelist = [l for l in labelist if labelist.count(l) > 50 ]
labelist = list(set(labelist))
#print("len(labelist): {}".format(len(labelist)))
in_labelist_ = {k: labelist.count(k) for k in labelist}
labelist = sort_dictionary(in_labelist_)
labelist.reverse()
labeldict = {elem[0] : i for i, elem in enumerate(labelist)}
#for elem in labelist:
# l = elem[0]
# c = elem[1]
#labeldict = {elem[0] : len(labelist)-(i+1) for i, elem in enumerate(labelist)}
#labelist = list(set(labelist))
#labeldict = {k: v for v, k in enumerate(labelist)}
labeldict.update({'DEFAULT': len(labelist)})
labeldict = {k: v for v, k in enumerate(labelist)}
labeldict.update({'DEFAULT' : len(labeldict)})
def gen_cat_lines(textacyCorpus, labeldict):
@ -260,10 +298,9 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
for doc in textacyCorpus:
label = labeldict.get(normalize(doc.metadata["categoryName"]), labeldict['DEFAULT'])
# question: use only the x most frequent labels and drop the rest?
yield "[ " + str(label) + " ] " + doc.text
if label != labeldict['DEFAULT']:  # note: the label is a numeric index, so comparing it to the string 'DEFAULT' was always True
yield "[ " + str(label) + " ] " + doc.text
line_gen = gen_cat_lines(corpus, labeldict)
@ -602,26 +639,48 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
logprint("\n\n\nTime Elapsed LLDA :{0} min\n\n".format((end - start) / 60))
def load_from_labled_lines(path):
path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/pre_labled_lines_wo_lemma_061217.txt"
# idea / plan
# load clean, load pre
# unigrams and NUM/word bigrams in the doc-term matrix  # question: how does llda handle bigrams? idea: if necessary, treat bigrams as bracketed "words"
# take only ngrams where at least one token occurs in pre (rough sketch after this comment block)
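A rough sketch of the planned unigram plus NUM/word-bigram extraction, keeping only n-grams in which at least one token also occurs in the pre corpus; names and the bracketing format are assumptions, not the repository's implementation:

def unigrams_and_num_bigrams(cleaned_tokens, pre_vocab):
    # cleaned_tokens: token list of a cleaned doc, numbers already mapped to "NUM"
    # pre_vocab: set of tokens that occur in the preprocessed (pre) corpus
    terms = [t for t in cleaned_tokens if t in pre_vocab]
    for t1, t2 in zip(cleaned_tokens, cleaned_tokens[1:]):
        if "NUM" in (t1, t2) and (t1 in pre_vocab or t2 in pre_vocab):
            terms.append("({0}_{1})".format(t1, t2))  # bigrams as bracketed "words"
    return terms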
def main( algorithm="llda"):
def main(cleaned_corpus, pre_corpus, algorithm="llda"):
logprint("Topic Modeling: {0}".format(datetime.now()))
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
preCorpus_name = "de" + "_pre_ticket_old"
preCorpus_name = "de" + "_pre_ticket"
# todo: load from labled_lines ??
# idea: thesaurus before id2term
# todo: keep acronyms & abbreviations
# todo: bigrams not on pre, but on cleaned
# todo: keep numbers; bigrams: NUM-word combinations
# todo: Levenshtein/Hamming distance instead of autocorrect  # idea: or word2vec
# todo: include the ticket subject
resultspath = FILEPATH + "results/pre"
# load corpus
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(de_corpus.lang))
de_corpus = pre_corpus
if algorithm == "llda":
top_topic_words = 5
top_topic_words = 3
jgibbsLLDA_category(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)
@ -712,7 +771,20 @@ def main( algorithm="llda"):
if __name__ == "__main__":
main()
# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
pre_corpus_name = "de" + "_pre"
pre_corpus, parser = load_corpus(corpus_name=pre_corpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(pre_corpus_name))
cleaned_corpus_name = "de" + "_raw"
#cleaned_corpus, parser = load_corpus(corpus_name=cleaned_corpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(cleaned_corpus_name))
cleaned_corpus = None
main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="llda")
main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="lda")

View File

@ -367,7 +367,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=Fals
keywords = lino[2]
keywords_list = [x.lower().strip() for x in map(replaceRockDots(),str(keywords).split(","))]
keywords_list = [x.lower().strip() for x in map(replaceRockDots_lambda(), str(keywords).split(","))]
if kb_keywords:
for item in keywords_list: