@ -11,6 +11,9 @@ from scipy import *
import os
from preprocessing import removePOS
from preprocessing import filterTokens
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@ -24,11 +27,6 @@ with open(config_ini) as f:
# Marks WORDS as a global name; it is assigned from load_obj(path2wordsdict)
# further down in this file.
global WORDS
# Character class matching one "special" character: backtick, =, ~, %, ^, &, *,
# parentheses, underscore, +, square brackets, braces, ;, single/double quote,
# |, <, / and >. The commented-out raw strings at the end of the line are extra
# punctuation sets that are currently NOT part of the pattern.
REGEX_SPECIALCHAR = r'[`\=~%^&*()_+\[\]{};\'"|</>]' #+r',.-\\:' #+r',.?!'
@ -113,15 +111,12 @@ def clean(stringstream,autocorrect=False):
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# frage autocorrect?
#frage autocorrect? idee
if autocorrect:
string = " ".join([autocorrectWord(word) for word in string.split()])
yield string
def processDictstream(dictstream, funcdict, parser):
@ -154,30 +149,21 @@ def processDictstream(dictstream, funcdict, parser):
result[key] = value
yield result
def filterTokens(tokens, funclist):
    """Keep only the tokens accepted by every predicate in ``funclist``.

    :param tokens: iterable of tokens (any objects the predicates accept)
    :param funclist: iterable of one-argument predicates; a token survives
        only if each predicate returns a truthy value for it
    :return: a new list with the surviving tokens in their original order
    """
    # Materialise the predicates once so that a one-shot iterable of
    # predicates can be re-applied to every token.
    predicates = list(funclist)
    # Single pass over the tokens. all() short-circuits, so a token rejected
    # by one predicate is never handed to the following ones. With no
    # predicates every token is kept and the result is still a fresh list.
    return [tok for tok in tokens if all(keep(tok) for keep in predicates)]
def removePOS(pos_list):
    """Build a token predicate that rejects the given part-of-speech tags.

    :param pos_list: container of POS tags that should be filtered out
    :return: one-argument callable returning True when the token's ``pos_``
        attribute is not contained in ``pos_list``
    """
    def keep_token(tok):
        return not (tok.pos_ in pos_list)

    return keep_token
# Directory with the resource files, located next to this module (FILEPATH is
# the directory of this file).
ressources_path = FILEPATH + "ressources/"
# NOTE(review): this value is overwritten by the assignment on the next line.
path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
# Word dictionary file for spell checking ([spellchecking] pickle_file),
# resolved inside the resource directory.
path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")
# Configured locations of the German and English corpora.
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
# Boolean switch read from [preprocessing] autocorrect; handed to cleanCorpus().
autocorrect = config.getboolean("preprocessing", "autocorrect")
def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
def cleanCorpus(corpus_path, clean_in_meta, lang="de", printrandom=10,autocorrect=False):
logprint("Clean {0}_corpus at {1}".format(lang,
@ -192,7 +178,7 @@ def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrando
## process and add files to textacy-corpi,
processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
@ -220,8 +206,6 @@ def main():
WORDS = load_obj(path2wordsdict)
clean_in_content = [] #frage notwendig?
clean_in_meta = {
"Solution": [removePOS(["SPACE"])],
@ -229,7 +213,7 @@ def main():
"categoryName": [removePOS(["SPACE", "PUNCT"])]
corpus = cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de",printrandom=5 )
corpus = cleanCorpus(corpus_de_path, clean_in_meta, "de",printrandom=5, autocorrect=autocorrect )
end = time.time()
logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))

@ -64,7 +64,10 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
autocorrect = false

View File

@ -23,8 +23,6 @@ with open(config_ini) as f:
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
:param path2csv: string
@ -75,27 +73,9 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
content_collumn_name = "Description"
metaliste = [
content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(",")))
metaliste = get_list_from_config("tickets","metaliste")
path2de_csv = FILEPATH + config.get("de_corpus","input")
@ -110,7 +90,6 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
# print paths
path_csv_split = path2_csv.split("/")
filename = path_csv_split[len(path_csv_split) - 1]
@ -121,8 +100,6 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
raw_corpus = textacy.Corpus(lang)
## add files to textacy-corpi,
#printlog("Add texts to {0}_textacy-corpi".format(lang))
ticketcsv_to_textStream(path2_csv, content_collumn_name),
ticket_csv_to_DictStream(path2_csv, metaliste)
@ -132,6 +109,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
# drop empty docs from the corpus
raw_corpus.remove(lambda doc: len(doc) == 0)
logprint("corpus-lenght: {}".format(len(raw_corpus)))
# print random docs
for i in range(printrandom):

@ -237,36 +237,37 @@ def build_words_for_spellchecking(path2words):
# Input/output file locations for the resource-building steps, all read from
# the config.
# NOTE(review): in every group below, the FILEPATH-based assignments are
# re-bound to the same config values under ressources_path, so only the
# ressources_path variants take effect.

# --- thesaurus ---
path2wordnet = FILEPATH + config.get("thesaurus","input")
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
# Directory with the resource files; prefix for all effective paths below.
ressources_path = FILEPATH + "ressources/"
path2wordnet = ressources_path + config.get("thesaurus","input")
path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
# --- spell checking ---
path2words_file = FILEPATH + config.get("spellchecking","input")
path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")
path2words_file = ressources_path + config.get("spellchecking","input")
path2wordlist = ressources_path + config.get("spellchecking","pickle_file")
# --- lemmatization ---
path2lemma_file = FILEPATH + config.get("lemmatization","input")
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
path2lemma_file = ressources_path + config.get("lemmatization","input")
path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
# --- nouns (two input files, one output file) ---
nouns1 = FILEPATH + config.get("nouns","input1")
nouns2 = FILEPATH + config.get("nouns","input2")
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
nouns1 = ressources_path + config.get("nouns","input1")
nouns2 = ressources_path + config.get("nouns","input2")
path2nouns_list = ressources_path + config.get("nouns","pickle_file")
# --- first names ---
firstnames_txt = FILEPATH + config.get("firstnames","input")
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
firstnames_txt = ressources_path + config.get("firstnames","input")
path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
# --- German stop words (three input files, one output file) ---
stop1 = FILEPATH + config.get("de_stopwords","input1")
stop2 = FILEPATH + config.get("de_stopwords","input2")
stop3 = FILEPATH + config.get("de_stopwords","input3")
path2stopwordlist_de = FILEPATH + config.get("de_stopwords","pickle_file")
stop1 = ressources_path + config.get("de_stopwords","input1")
stop2 = ressources_path + config.get("de_stopwords","input2")
stop3 = ressources_path + config.get("de_stopwords","input3")
path2stopwordlist_de = ressources_path + config.get("de_stopwords","pickle_file")
# --- English stop words (output file only) ---
path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file")
path2stopwordlist_en = ressources_path + config.get("en_stopwords","pickle_file")

@ -11,12 +11,12 @@ import cleaning
from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &"
start = time.time()
@ -30,32 +30,23 @@ logprint("")
end = time.time()
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))

View File

@ -153,6 +153,25 @@ def printRandomDoc(textacyCorpus):
def get_list_from_config(section,option):
    """Read a comma-separated config option and return its entries as a list.

    :param section: name of the config section
    :param option: name of the option inside that section
    :return: list with one entry per comma-separated item, each passed through
        textacy's ``normalize_whitespace``
    """
    normalize = textacy.preprocess.normalize_whitespace
    raw_entries = config.get(section, option).split(",")
    return [normalize(entry) for entry in raw_entries]
def corpus2Text(corpus):
    """Lazily yield the ``text`` attribute of every document in ``corpus``."""
    yield from (document.text for document in corpus)
def corpus2Meta(corpus):
    """Lazily yield the ``metadata`` attribute of every document in ``corpus``."""
    yield from (document.metadata for document in corpus)
def savelabledCorpiLines(corpus,filepath):
    """Write the labelled lines produced by ``gen_labledLines(corpus)`` to ``filepath``.

    :param corpus: iterable of documents, forwarded to ``gen_labledLines``
    :param filepath: target file, forwarded to textacy's ``write_file_lines``
    """
    labelled_lines = gen_labledLines(corpus)
    textacy.fileio.write_file_lines(labelled_lines, filepath=filepath)
def gen_labledLines(corpus):
    """Yield one labelled line per document, formatted ``[<categoryName>] <text>``.

    :param corpus: iterable of documents exposing ``metadata["categoryName"]``
        and ``text``
    :return: generator of strings
    """
    for document in corpus:
        # Label prefix first, document text second.
        label = "[" + document.metadata["categoryName"] + "] "
        yield label + document.text
def save_corpus(corpus, corpus_path, corpus_name):
@ -219,95 +238,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
for key,value in plain.items():
if key != "content" and key != "index":
meta[key] = value
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang
def corpus2Text(corpus):
    """Generate the text of each document in ``corpus``, in corpus order."""
    for document in corpus:
        content = document.text
        yield content
def corpus2Meta(corpus):
    """Generate the metadata of each document in ``corpus``, in corpus order."""
    for document in corpus:
        meta = document.metadata
        yield meta
def saveplaincorpustext(corpus,path):
    """Write the text of every document in ``corpus`` to the file at ``path``.

    :param corpus: iterable of documents, forwarded to ``corpus2Text``
    :param path: target file, forwarded to textacy's ``write_file_lines``
    """
    plain_texts = corpus2Text(corpus)
    textacy.fileio.write_file_lines(plain_texts, filepath=path)
def save_corpusV2(corpus, corpus_path, corpus_name):
# Builds the on-disk locations for a corpus: a "<lang>_parser" path derived
# from the corpus' spaCy language object and a "<corpus_name>_docs/" directory
# that gets one "<corpus_index>_doc.bin" and one "<corpus_index>_meta.json"
# file per document.
# NOTE(review): the bodies of the `if` and of both `with` statements are not
# visible here, so what is created or written cannot be confirmed from this view.
# save parser
parser = corpus.spacy_lang
parserpath = corpus_path + str(parser.lang) + '_parser'
# per-document files live below this directory
contentpath = corpus_path +corpus_name + "_docs/"
if not os.path.exists(contentpath):
for doc in corpus:
# NOTE(review): the ".bin" file is opened in text mode ('w') - confirm that
# whatever is written here is text and not bytes.
with open(contentpath + str(doc.corpus_index) + "_doc.bin", 'w') as f:
with open(contentpath + str(doc.corpus_index) + "_meta.json", 'w') as file:
def load_corpusV2(corpus_path, corpus_name, lang="de"):
# Rebuilds a textacy.Corpus from the per-document files found below
# corpus_path + corpus_name + "_docs/" and returns (corpus, corpus.spacy_lang).
# check for language: a "de_" / "en_" marker in corpus_name overrides `lang`
if "de_" in corpus_name:
lang = "de"
elif "en_" in corpus_name:
lang = "en"
# load parser
parser = spacy.load(lang)
stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
# NOTE(review): the body of this `with` is not visible here; presumably the
# string store is loaded into the parser's vocab - confirm against the full file.
with open(stringstorepath) as file:
vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
# load corpus
corpus = textacy.Corpus(parser)
contentpath = corpus_path + corpus_name + "_docs/"
# docs and metas come from the same directory and are paired by position
docs = yield_fromdir(contentpath,spacy_vocab=corpus.spacy_vocab,type="doc")
metas = yield_fromdir(contentpath,type="meta")
for doc,meta in zip(docs,metas):
# NOTE(review): the trailing ")" has no visible opening call; the line that
# opens it (presumably adding the doc to the corpus) is missing from this view.
textacy.Doc(doc, lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang
def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
    """Yield the objects stored in the files of directory ``path``.

    Only regular files whose name contains ``type`` are read. They are ordered
    by the part of the file name before the first underscore.

    :param path: directory to read from (with or without a trailing separator)
    :param spacy_vocab: vocab handed to ``SpacyDoc`` when ``type == 'doc'``
    :param type: file-name filter and loader selector: ``'doc'`` reads
        serialized docs, ``'meta'`` reads JSON, anything else goes through
        ``load_obj``
    :return: generator over the loaded objects
    """
    # List the requested directory itself; listing '.' only worked when the
    # current working directory happened to be ``path``.
    # NOTE: the prefix is compared as a string, so "10" sorts before "2".
    filelist = sorted(
        (name for name in os.listdir(path)
         if type in name and os.path.isfile(os.path.join(path, name))),
        key=lambda name: name.split("_")[0],
    )

    if type == 'doc':
        for filename in filelist:
            # Serialized docs are raw bytes, so the file is opened in binary mode.
            with open(os.path.join(path, filename), 'rb') as f:
                for bytes_string in SpacyDoc.read_bytes(f):
                    yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
    elif type == 'meta':
        for filename in filelist:
            with open(os.path.join(path, filename), 'r') as f:
                yield json.load(f)
    else:
        # Fallback for every other type (e.g. the default ".pkl").
        for filename in filelist:
            yield load_obj(os.path.join(path, filename))
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang

