diff --git a/config.ini b/config.ini
index fdc5e2e..49394be 100644
--- a/config.ini
+++ b/config.ini
@@ -1,58 +1,53 @@
 [thesaurus]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl
+input = deWordNet.xml
+pickle_file = thesaurus_dict.pkl

 [spellchecking]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl
+input = deu_news_2015_1M-sentences.txt
+pickle_file = words_dict.pkl

 [lemmatization]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl
+input = lemmas.txt
+pickle_file = lemma_dict.pkl

 [nouns]
-input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
-input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl
+input1 = nomen.txt
+input2 = nomen2.txt
+pickle_file = nouns_list.pkl

 [firstnames]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl
+input = firstnames.txt
+pickle_file = firstnames_list.pkl

 [de_stopwords]
-input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
-input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
-input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl
+input1 = de_stopwords_1.txt
+input2 = de_stopwords_2.txt
+input3 = de_stopwords_3.txt
+pickle_file = stopwords_list.pkl

 [logging]
 level = INFO
-filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
+filename = topicModelTickets.log

 [de_corpus]
-#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
-#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
-#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv
-
-path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
-raw = de_raw_ticket
-pre = de_pre_ticket
+#input = M42-Export/Tickets_med.csv
+#input = M42-Export/Tickets_small.csv
+#input = M42-Export/Tickets_mini.csv
+input = M42-Export/de_tickets.csv
+path = corpi/

 [en_corpus]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv
+input = M42-Export/en_tickets.csv
-path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
-raw = en_raw_ticket
-pre = en_pre_ticket
+path = corpi/
diff --git a/corporization.py b/corporization.py
index c2f69f1..50cba8e 100644
--- a/corporization.py
+++ b/corporization.py
@@ -1,53 +1,20 @@
 # -*- coding: utf-8 -*-
-from datetime import datetime
-
-import time
-import logging
-from stop_words import get_stop_words
-
-#import words as words
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
 import csv
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
 import sys
-csv.field_size_limit(sys.maxsize)
-import pickle
-import configparser as ConfigParser
 from miscellaneous import *
-
-
-
-
-
-
-
 import time
-
-
-
-
 from datetime import datetime
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
 import re
-import xml.etree.ElementTree as ET
-import spacy
 import textacy
 from scipy import *
-import sys
-csv.field_size_limit(sys.maxsize)
+import os
 csv.field_size_limit(sys.maxsize)
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"

 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:
@@ -128,70 +95,61 @@ metaliste = [

 content_collumn_name = config.get("tickets","content_collumn_name")
-metaliste = config.get("tickets","metaliste")
+metaliste = config.get("tickets","metaliste").split(",")

-path2de_csv = config.get("de_corpus","input")
-corpus_de_path = config.get("de_corpus", "path")
-raw_de_name = config.get("de_corpus", "raw")
+path2de_csv = FILEPATH + config.get("de_corpus","input")
+corpus_de_path = FILEPATH + config.get("de_corpus", "path")

-path2en_csv = config.get("en_corpus","input")
-corpus_en_path = config.get("en_corpus", "path")
-raw_en_name = config.get("en_corpus", "raw")
+path2en_csv = FILEPATH + config.get("en_corpus","input")
+corpus_en_path = FILEPATH + config.get("en_corpus", "path")

-def main():
-    start = time.time()
-    printlog("Corporization: {0}".format(datetime.now()))
-
-
-    #print paths
-    path_csv_split = path2de_csv.split("/")
-    printlog(path_csv_split[len(path_csv_split) - 1])
-    path_csv_split = path2en_csv.split("/")
-    printlog(path_csv_split[len(path_csv_split) - 1])
+def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):

-    DE_PARSER = spacy.load("de")
-    EN_PARSER = spacy.load("en")
+    # print paths
+    path_csv_split = path2_csv.split("/")
+    filename = path_csv_split[len(path_csv_split) - 1]

-    raw_de_corpus = textacy.Corpus(DE_PARSER)
-    raw_en_corpus = textacy.Corpus(EN_PARSER)
+    printlog("Corporization of {0} at {1}".format(filename,datetime.now()))

+    raw_corpus = textacy.Corpus(lang)

     ## add files to textacy-corpi,
-    printlog("Add texts to textacy-corpi")
+    printlog("Add texts to {0}_textacy-corpi".format(lang))

-    raw_de_corpus.add_texts(
-        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
-        ticket_csv_to_DictStream(path2de_csv, metaliste)
-    )
-
-    raw_en_corpus.add_texts(
-        ticketcsv_to_textStream(path2en_csv, content_collumn_name),
-        ticket_csv_to_DictStream(path2en_csv, metaliste)
+    raw_corpus.add_texts(
+        ticketcsv_to_textStream(path2_csv, content_collumn_name),
+        ticket_csv_to_DictStream(path2_csv, metaliste)
     )

     # leere docs aus corpi kicken
-    raw_de_corpus.remove(lambda doc: len(doc) == 0)
-    raw_en_corpus.remove(lambda doc: len(doc) == 0)
+    raw_corpus.remove(lambda doc: len(doc) == 0)
+
+    #random Doc printen
+    for i in range(printrandom):
+        printRandomDoc(raw_corpus)

-    #for i in range(20):
-    #    printRandomDoc(raw_de_corpus)
-    #    printRandomDoc(raw_en_corpus)
+    # save corpus
+    raw_name = lang + "_raw_ticket"
+    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)

-    #save corpi
-    save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
-    save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)

+def main():
+    start = time.time()
+    ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,metaliste,lang="de")
+
+    ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en")
+
     end = time.time()
     printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))
diff --git a/init.py b/init.py
index 68d5f8c..596190f 100644
--- a/init.py
+++ b/init.py
@@ -1,29 +1,24 @@
 # -*- coding: utf-8 -*-
-from datetime import datetime
-
-import time
-import logging
-from stop_words import get_stop_words
-
-#import words as words
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
-import csv
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-csv.field_size_limit(sys.maxsize)
-import pickle
-import configparser as ConfigParser
 from miscellaneous import *
+from stop_words import get_stop_words
+import csv
+import sys
+import xml.etree.ElementTree as ET
+
+from nltk.corpus import stopwords as nltk_stopwords
+
+from collections import Counter
+import time
+from datetime import datetime
+import os
+
+csv.field_size_limit(sys.maxsize)
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"

 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:
@@ -70,7 +65,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):

     Result will be used as lemma_dict["word"] --> lemma

-    :param path2lexicalentries: str
+    :param path2wordnet: str
     :param returnall: bool if True, also return , word2synsets, synset2Words
     :return: dictionaries: thesaurus
     """
@@ -242,34 +237,34 @@ def build_words_for_spellchecking(path2words):
 ##################################################################################################

 # THESAURUS
-path2wordnet = config.get("thesaurus","input")
-path2thesaurus_dict = config.get("thesaurus","pickle_file")
+path2wordnet = FILEPATH + config.get("thesaurus","input")
+path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")

 # SPELLCHECKING
-path2words_file = config.get("spellchecking","input")
-path2wordlist = config.get("spellchecking","pickle_file")
+path2words_file = FILEPATH + config.get("spellchecking","input")
+path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")

 # LEMMA
-path2lemma_file = config.get("lemmatization","input")
-path2lemmadict = config.get("lemmatization","pickle_file")
+path2lemma_file = FILEPATH + config.get("lemmatization","input")
+path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")

 # NOMEN
-nouns1 = config.get("nouns","input1")
-nouns2 = config.get("nouns","input2")
-path2nouns_list = config.get("nouns","pickle_file")
+nouns1 = FILEPATH + config.get("nouns","input1")
+nouns2 = FILEPATH + config.get("nouns","input2")
+path2nouns_list = FILEPATH + config.get("nouns","pickle_file")

 # VORNAMEN
-firstnames_txt = config.get("firstnames","input")
-path2firstnameslist = config.get("firstnames","pickle_file")
+firstnames_txt = FILEPATH + config.get("firstnames","input")
+path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")

 # STOPWORDS
-stop1 = config.get("de_stopwords","input1")
-stop2 = config.get("de_stopwords","input2")
-stop3 = config.get("de_stopwords","input3")
-path2stopwordlist = config.get("de_stopwords","pickle_file")
+stop1 = FILEPATH + config.get("de_stopwords","input1")
+stop2 = FILEPATH + config.get("de_stopwords","input2")
+stop3 = FILEPATH + config.get("de_stopwords","input3")
config.get("de_stopwords","input3") +path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file") diff --git a/main.py b/main.py index 9b821d3..faa9c8e 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- - +import time import init import corporization import preprocessing @@ -8,7 +8,7 @@ from miscellaneous import * # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &" - +start = time.time() init.main() printlog("") @@ -19,3 +19,5 @@ printlog("") preprocessing.main() printlog("") +end = time.time() +printlog("Total Time Elapsed: {0} min".format((end - start) / 60)) diff --git a/miscellaneous.py b/miscellaneous.py index 92f19a0..debe414 100644 --- a/miscellaneous.py +++ b/miscellaneous.py @@ -1,87 +1,25 @@ # -*- coding: utf-8 -*- -import random - -import time - -from pathlib import Path - -from datetime import datetime -import logging -from nltk.corpus import stopwords -import csv -import functools -import re -import xml.etree.ElementTree as ET -import spacy -import textacy -from scipy import * -import sys - -from datetime import datetime - - -import time -start = time.time() - -import logging -from nltk.corpus import stopwords -import csv -import functools -import re -import xml.etree.ElementTree as ET -import spacy -import textacy -from scipy import * -import sys -csv.field_size_limit(sys.maxsize) - - -import time - -import enchant - -start = time.time() - -import logging - -import csv -import functools -import os.path -import re -import subprocess -import time -import xml.etree.ElementTree as ET -import sys -import spacy -import textacy -from scipy import * -from textacy import Vectorizer -import warnings import configparser as ConfigParser -import sys -import hunspell -from postal.parser import parse_address - -from datetime import datetime - -import time -import logging -from nltk.corpus import stopwords as nltk_stopwords -from collections import Counter import csv +import functools +import logging +import random import re -import xml.etree.ElementTree as ET +import sys +from pathlib import Path +import pickle import spacy import textacy from scipy import * -import sys +import os + csv.field_size_limit(sys.maxsize) -import pickle +FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" # load config -config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini" +config_ini = FILEPATH + "config.ini" config = ConfigParser.ConfigParser() with open(config_ini) as f: @@ -90,7 +28,7 @@ with open(config_ini) as f: # config logging -filename = config.get("logging","filename") +filename = FILEPATH + config.get("logging","filename") level = config.get("logging","level") if level == "INFO": level = logging.INFO @@ -188,10 +126,13 @@ def printRandomDoc(textacyCorpus): :param textacyCorpus: """ print() - printlog("len(textacyCorpus) = %i" % len(textacyCorpus)) - randIndex = int((len(textacyCorpus) - 1) * random.random()) - printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text, - textacyCorpus[randIndex].metadata)) + if len(textacyCorpus) == 0: + printlog("NO DOCS IN CORPUS") + else: + printlog("len(textacyCorpus) = %i" % len(textacyCorpus)) + randIndex = int((len(textacyCorpus) - 1) * random.random()) + printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text, + textacyCorpus[randIndex].metadata)) print() @@ 
@@ -239,14 +180,14 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
     Load textacy-Corpus including spacy-parser out from file
     :param corpus_path: str
     :param corpus_name: str (should content the language like "_de_")
-    :param lang: str language code)
+    :param lang: str (language code) or spacy.Language
     :return: texracy.Corpus, spacy.language
     """

     #ckeck for language
-    if "_de_" in corpus_name:
+    if "de_" in corpus_name:
         lang="de"
-    elif "_en_" in corpus_name:
+    elif "en_" in corpus_name:
         lang ="en"
diff --git a/preprocessing.py b/preprocessing.py
index 6a0e5c7..c7f0d65 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -1,76 +1,23 @@
 # -*- coding: utf-8 -*-
 from datetime import datetime
-print(datetime.now())
-from datetime import datetime
-
-import time
-import logging
-from stop_words import get_stop_words
-
-#import words as words
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
 import csv
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
 import sys
-csv.field_size_limit(sys.maxsize)
-import pickle
-import configparser as ConfigParser
 from miscellaneous import *
-
-
-
-
-
-
-
 import time
-
-
-
-
 from datetime import datetime
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-csv.field_size_limit(sys.maxsize)
-
-
 import time
-
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
 import textacy
 from scipy import *
-import sys
+
+import os
+
 csv.field_size_limit(sys.maxsize)
-
-
-
-import pickle
-
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"

 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:
@@ -178,7 +125,6 @@ def getFirstSynonym(word, thesaurus=THESAURUS):

 # http://wortschatz.uni-leipzig.de/en/download

 import re
-from collections import Counter

 def words(text): return re.findall(r'\w+', text.lower())
@@ -250,7 +196,7 @@ def stringcleaning(stringstream):
         string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))

         # cut_after
-        word = "gruss"
+        word = "gruss" #idee addressen enfernen --> postal.parser
         string = string.rpartition(word)[0] if word in string else string

         # lemmatize
@@ -347,29 +293,23 @@ def processDictstream(dictstream, funcdict, parser):
 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"

-path2thesaurus_dict = config.get("thesaurus","pickle_file")
+path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")

-path2wordsdict = config.get("spellchecking", "pickle_file")
+path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")

-path2lemmadict = config.get("lemmatization","pickle_file")
+path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")

-path2nouns_list = config.get("nouns","pickle_file")
+path2nouns_list = FILEPATH + config.get("nouns","pickle_file")

-path2firstnameslist = config.get("firstnames","pickle_file")
+path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")

-path2stopwordlist = config.get("de_stopwords","pickle_file")
+path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")
config.get("de_stopwords","pickle_file") -corpus_de_path = config.get("de_corpus", "path") -raw_de_name = config.get("de_corpus", "raw") -pre_de_name = config.get("de_corpus", "pre") +corpus_de_path = FILEPATH + config.get("de_corpus", "path") - - -corpus_en_path = config.get("en_corpus", "path") -raw_en_name = config.get("en_corpus", "raw") -pre_en_name = config.get("en_corpus", "pre") +corpus_en_path = FILEPATH + config.get("en_corpus", "path") custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus", @@ -383,10 +323,8 @@ custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanke "funktionieren", "kollege", "pruefen", "hoffen" ] - filter_tokens = [ # removeENT(["PERSON"]), - # idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser keepNouns(), @@ -403,8 +341,8 @@ filter_tokens = [ ] - - +#todo filtertokens haut alle raus +filter_tokens = None clean_in_meta = { "Solution": [removePOS(["SPACE"])], @@ -412,17 +350,43 @@ clean_in_meta = { "categoryName": [removePOS(["SPACE", "PUNCT"])] } +def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10): + + printlog("Preprocess {0}_corpus at {1}".format(lang,datetime.now())) + + rawCorpus_name = lang + "_raw_ticket" + preCorpus_name = lang + "_pre_ticket" + + #load raw corpus and create new one + raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path) + + corpus = textacy.Corpus(parser) + + + ## process and add files to textacy-corpi, + corpus.add_texts( + processContentstream(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser), + processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser) + ) + + + # leere docs aus corpi kicken + corpus.remove(lambda doc: len(doc) == 0) + + + for i in range(printrandom): + printRandomDoc(corpus) - + #save corpus + save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name) def main(): start = time.time() - printlog("Preprocessing: {0}".format(datetime.now())) THESAURUS = load_obj(path2thesaurus_dict) @@ -432,44 +396,9 @@ def main(): NOUNS = load_obj(path2nouns_list) VORNAMEN = load_obj(path2firstnameslist) + preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" ) - - #load raw corpus and create new one - raw_de_corpus, DE_PARSER = load_corpus(corpus_name=raw_de_name, corpus_path=corpus_de_path) - raw_en_corpus, EN_PARSER = load_corpus(corpus_name=raw_en_name, corpus_path=corpus_en_path) - - de_corpus = textacy.Corpus(DE_PARSER) - en_corpus = textacy.Corpus(EN_PARSER) - - - - ## process and add files to textacy-corpi, - printlog("Preprocess and add texts to textacy-corpi") - de_corpus.add_texts( - processContentstream(corpus2Text(raw_de_corpus), token_filterlist=filter_tokens, parser=DE_PARSER), - processDictstream(corpus2Meta(raw_de_corpus), clean_in_meta,parser=raw_de_corpus.lang) - ) - en_corpus.add_texts( - processContentstream(corpus2Text(raw_en_corpus), token_filterlist=filter_tokens, parser=EN_PARSER), - processDictstream(corpus2Meta(raw_en_corpus), clean_in_meta,parser=raw_en_corpus.lang) - ) - - - # leere docs aus corpi kicken - de_corpus.remove(lambda doc: len(doc) == 0) - en_corpus.remove(lambda doc: len(doc) == 0) - - - for i in range(20): - printRandomDoc(de_corpus) - #printRandomDoc(en_corpus) - - - - #save corpi - save_corpus(corpus=de_corpus, corpus_path=corpus_de_path, corpus_name=pre_de_name) - save_corpus(corpus=en_corpus, corpus_path=corpus_en_path, corpus_name=pre_en_name) - + 
+    preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )

     end = time.time()
diff --git a/testra.py b/testra.py
index f5a5b4f..843d548 100644
--- a/testra.py
+++ b/testra.py
@@ -1,10 +1,11 @@
 # -*- coding: utf-8 -*-
+
 import re
 import time
 import json

-import spacy
-import textacy
+#import spacy
+#import textacy
 from functools import reduce

 start = time.time()
@@ -15,7 +16,6 @@ from datetime import datetime

 import xml.etree.ElementTree as ET

-print(datetime.now())

 """
 PARSER=spacy.load("de")
@@ -55,19 +55,8 @@ print(corpi)
 """

-import pickle
-
-def save_obj(obj, path):
-    with open(path + '.pkl', 'wb') as f:
-        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
-
-def load_obj(path ):
-    with open(path + '.pkl', 'rb') as f:
-        return pickle.load(f)
-
-
-
+"""
 def load_corpus(corpus_path, corpus_name, lang="de"):
     from pathlib import Path
@@ -95,7 +84,7 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
         corpus.add_doc(
             textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
     return corpus
-
+"""
 import os
 a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
 b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"
@@ -103,37 +92,16 @@ d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.
 c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"

-liste = [a,b,c,d]
-de_filepaths = [path for path in liste if os.path.basename(path).split("_")[0]=='de' and os.path.basename(path).split("_")[1]=='stopwords']
-
-from nltk.corpus import stopwords as nltk_stopwords
-
-from stop_words import get_stop_words
-import spacy
-from miscellaneous import *
-
-# from packages
-de_stop_words1 = list(get_stop_words("de"))
-
-de_stop_words2 = list(nltk_stopwords.words('german'))
-
-de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
-
-# from files
-de_stop_words_list = [list(textacy.fileio.read_file_lines(path)) for path in de_filepaths]
-de_stop_words4 = [item for sublist in de_stop_words_list for item in sublist]
-#print(de_stop_words4)
-
-de_stop_words = list(set(map(replaceRockDots(),list(map(textacy.preprocess.normalize_whitespace, de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
-print(len(de_stop_words))
-
-# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testra.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_testra.log &"
+scriptpath = os.path.dirname(os.path.realpath(__file__))
+"""
 # THESAURUS
 lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
 lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
@@ -212,7 +180,7 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
             pass
     return thesaurus
-    """

     for r in synroot:
         for element in r:
@@ -223,9 +191,8 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
             if id not in synset2Words.keys():
                 synset2Words[id] = "WORD"
-        """
-
+"""
 """
 from postal.parser import parse_address
@@ -557,6 +524,7 @@ de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
 textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
 """
+
 end = time.time()
 print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))
diff --git a/topicModeling.py b/topicModeling.py
index 3e61672..75dbe12 100644
--- a/topicModeling.py
+++ b/topicModeling.py
@@ -94,9 +94,6 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en
     printlog("doc_term_matrix: {0}".format(doc_term_matrix))
     printlog("id2term: {0}".format(id2term))

-
-
-
 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
 corpus_name = "de_corpus"
@@ -104,11 +101,6 @@ corpus_name = "de_corpus"

 de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path)

-for i in range(5):
-    printRandomDoc(de_corpus)
-
-
-
 # todo gescheites tf(-idf) maß finden

 ngrams = 1
@@ -128,6 +120,7 @@ printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
 printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
 """
+

 # build citionary of ticketcategories
 labelist = []
@@ -139,6 +132,7 @@ LABELDICT = {k: v for v, k in enumerate(labelist)}

 printlog(str("LABELDICT: {0}".format(LABELDICT)))

+
 def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False, corpus=de_corpus):
     printlog(