weiter aufgeräumt ("further cleanup")

jannis.grundmann 2017-10-17 10:13:49 +02:00
parent 56c8bce2d7
commit 17e45c30af
8 changed files with 170 additions and 388 deletions

View File

@@ -1,58 +1,53 @@
 [thesaurus]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
+input = deWordNet.xml
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl
+pickle_file = thesaurus_dict.pkl
 [spellchecking]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
+input = deu_news_2015_1M-sentences.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl
+pickle_file = words_dict.pkl
 [lemmatization]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
+input = lemmas.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl
+pickle_file = lemma_dict.pkl
 [nouns]
-input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
+input1 = nomen.txt
-input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
+input2 = nomen2.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl
+pickle_file = nouns_list.pkl
 [firstnames]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
+input = firstnames.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl
+pickle_file = firstnames_list.pkl
 [de_stopwords]
-input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
+input1 = de_stopwords_1.txt
-input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
+input2 = de_stopwords_2.txt
-input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
+input3 = de_stopwords_3.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl
+pickle_file = stopwords_list.pkl
 [logging]
 level = INFO
-filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
+filename = topicModelTickets.log
 [de_corpus]
-#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
+#input = M42-Export/Tickets_med.csv
-#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
+#input = M42-Export/Tickets_small.csv
-#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
+#input = M42-Export/Tickets_mini.csv
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv
+input = M42-Export/de_tickets.csv
-path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
-raw = de_raw_ticket
-pre = de_pre_ticket
+path = corpi/
 [en_corpus]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv
+input = M42-Export/en_tickets.csv
-path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
+path = corpi/
-raw = en_raw_ticket
-pre = en_pre_ticket
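All paths in config.ini are now relative to the project directory; each script prepends its own location at runtime. A minimal sketch of the loading pattern the Python files below switch to (how the opened file is handed to ConfigParser is not visible in this diff and is assumed here):

import os
import configparser as ConfigParser

# resolve everything relative to the directory the script lives in
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

config = ConfigParser.ConfigParser()
with open(FILEPATH + "config.ini") as f:
    config.read_file(f)  # assumption; only the `with open(...)` line appears in the diff

# relative values from the config are joined with FILEPATH before use
path2wordnet = FILEPATH + config.get("thesaurus", "input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")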

View File

@@ -1,53 +1,20 @@
 # -*- coding: utf-8 -*-
-from datetime import datetime
-import time
-import logging
-from stop_words import get_stop_words
-#import words as words
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
 import csv
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
 import sys
-csv.field_size_limit(sys.maxsize)
-import pickle
-import configparser as ConfigParser
 from miscellaneous import *
 import time
 from datetime import datetime
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
 import re
-import xml.etree.ElementTree as ET
-import spacy
 import textacy
 from scipy import *
-import sys
+import os
-csv.field_size_limit(sys.maxsize)
+csv.field_size_limit(sys.maxsize)
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"
 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:
@@ -128,70 +95,61 @@ metaliste = [
 content_collumn_name = config.get("tickets","content_collumn_name")
-metaliste = config.get("tickets","metaliste")
+metaliste = config.get("tickets","metaliste").split(",")
-path2de_csv = config.get("de_corpus","input")
+path2de_csv = FILEPATH + config.get("de_corpus","input")
-corpus_de_path = config.get("de_corpus", "path")
+corpus_de_path = FILEPATH + config.get("de_corpus", "path")
-raw_de_name = config.get("de_corpus", "raw")
-path2en_csv = config.get("en_corpus","input")
+path2en_csv = FILEPATH + config.get("en_corpus","input")
-corpus_en_path = config.get("en_corpus", "path")
+corpus_en_path = FILEPATH + config.get("en_corpus", "path")
-raw_en_name = config.get("en_corpus", "raw")
-def main():
-    start = time.time()
-    printlog("Corporization: {0}".format(datetime.now()))
-    #print paths
-    path_csv_split = path2de_csv.split("/")
-    printlog(path_csv_split[len(path_csv_split) - 1])
-    path_csv_split = path2en_csv.split("/")
-    printlog(path_csv_split[len(path_csv_split) - 1])
-    DE_PARSER = spacy.load("de")
-    EN_PARSER = spacy.load("en")
-    raw_de_corpus = textacy.Corpus(DE_PARSER)
-    raw_en_corpus = textacy.Corpus(EN_PARSER)
+def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
+    # print paths
+    path_csv_split = path2_csv.split("/")
+    filename = path_csv_split[len(path_csv_split) - 1]
+    printlog("Corporization of {0} at {1}".format(filename,datetime.now()))
+    raw_corpus = textacy.Corpus(lang)
     ## add files to textacy-corpi,
-    printlog("Add texts to textacy-corpi")
+    printlog("Add texts to {0}_textacy-corpi".format(lang))
-    raw_de_corpus.add_texts(
-        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
-        ticket_csv_to_DictStream(path2de_csv, metaliste)
-    )
-    raw_en_corpus.add_texts(
-        ticketcsv_to_textStream(path2en_csv, content_collumn_name),
-        ticket_csv_to_DictStream(path2en_csv, metaliste)
+    raw_corpus.add_texts(
+        ticketcsv_to_textStream(path2_csv, content_collumn_name),
+        ticket_csv_to_DictStream(path2_csv, metaliste)
     )
     # leere docs aus corpi kicken
-    raw_de_corpus.remove(lambda doc: len(doc) == 0)
-    raw_en_corpus.remove(lambda doc: len(doc) == 0)
+    raw_corpus.remove(lambda doc: len(doc) == 0)
+    #random Doc printen
+    for i in range(printrandom):
+        printRandomDoc(raw_corpus)
-    #for i in range(20):
-    #    printRandomDoc(raw_de_corpus)
-    #    printRandomDoc(raw_en_corpus)
+    # save corpus
+    raw_name = lang + "_raw_ticket"
+    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)
-    #save corpi
-    save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
-    save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)
+def main():
+    start = time.time()
+    ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,metaliste,lang="de")
+    ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en")
     end = time.time()
     printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))

init.py
View File

@@ -1,29 +1,24 @@
 # -*- coding: utf-8 -*-
-from datetime import datetime
-import time
-import logging
-from stop_words import get_stop_words
-#import words as words
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
-import csv
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-csv.field_size_limit(sys.maxsize)
-import pickle
-import configparser as ConfigParser
 from miscellaneous import *
+from stop_words import get_stop_words
+import csv
+import sys
+import xml.etree.ElementTree as ET
+from nltk.corpus import stopwords as nltk_stopwords
+from collections import Counter
+import time
+from datetime import datetime
+import os
+csv.field_size_limit(sys.maxsize)
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"
 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:
@@ -70,7 +65,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
     Result will be used as lemma_dict["word"] --> lemma
-    :param path2lexicalentries: str
+    :param path2wordnet: str
     :param returnall: bool if True, also return , word2synsets, synset2Words
     :return: dictionaries: thesaurus
     """
@@ -242,34 +237,34 @@ def build_words_for_spellchecking(path2words):
 ##################################################################################################
 # THESAURUS
-path2wordnet = config.get("thesaurus","input")
+path2wordnet = FILEPATH + config.get("thesaurus","input")
-path2thesaurus_dict = config.get("thesaurus","pickle_file")
+path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
 # SPELLCHECKING
-path2words_file = config.get("spellchecking","input")
+path2words_file = FILEPATH + config.get("spellchecking","input")
-path2wordlist = config.get("spellchecking","pickle_file")
+path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")
 # LEMMA
-path2lemma_file = config.get("lemmatization","input")
+path2lemma_file = FILEPATH + config.get("lemmatization","input")
-path2lemmadict = config.get("lemmatization","pickle_file")
+path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
 # NOMEN
-nouns1 = config.get("nouns","input1")
+nouns1 = FILEPATH + config.get("nouns","input1")
-nouns2 = config.get("nouns","input2")
+nouns2 = FILEPATH + config.get("nouns","input2")
-path2nouns_list = config.get("nouns","pickle_file")
+path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
 # VORNAMEN
-firstnames_txt = config.get("firstnames","input")
+firstnames_txt = FILEPATH + config.get("firstnames","input")
-path2firstnameslist = config.get("firstnames","pickle_file")
+path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
 # STOPWORDS
-stop1 = config.get("de_stopwords","input1")
+stop1 = FILEPATH + config.get("de_stopwords","input1")
-stop2 = config.get("de_stopwords","input2")
+stop2 = FILEPATH + config.get("de_stopwords","input2")
-stop3 = config.get("de_stopwords","input3")
+stop3 = FILEPATH + config.get("de_stopwords","input3")
-path2stopwordlist = config.get("de_stopwords","pickle_file")
+path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")
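init builds the thesaurus, spell-check, lemma, noun, first-name and stopword resources from the inputs above and pickles them under the pickle_file paths; preprocessing later restores them with load_obj. The helpers match the save_obj/load_obj pair that this commit deletes from the scratch file further down (the copy kept in miscellaneous.py is not shown in this diff, so treat this as a sketch):

import pickle

def save_obj(obj, path):
    # serialize a built resource (dict or list) next to the configured path
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    # restore a previously pickled resource
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)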

View File

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
+import time
 import init
 import corporization
 import preprocessing
@@ -8,7 +8,7 @@ from miscellaneous import *
 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
+start = time.time()
 init.main()
 printlog("")
@@ -19,3 +19,5 @@ printlog("")
 preprocessing.main()
 printlog("")
+end = time.time()
+printlog("Total Time Elapsed: {0} min".format((end - start) / 60))

View File

@@ -1,87 +1,25 @@
 # -*- coding: utf-8 -*-
-import random
-import time
-from pathlib import Path
-from datetime import datetime
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-from datetime import datetime
-import time
-start = time.time()
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-csv.field_size_limit(sys.maxsize)
-import time
-import enchant
-start = time.time()
-import logging
-import csv
-import functools
-import os.path
-import re
-import subprocess
-import time
-import xml.etree.ElementTree as ET
-import sys
-import spacy
-import textacy
-from scipy import *
-from textacy import Vectorizer
-import warnings
 import configparser as ConfigParser
-import sys
-import hunspell
-from postal.parser import parse_address
-from datetime import datetime
-import time
-import logging
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
 import csv
-import functools
-import logging
-import random
 import re
-import xml.etree.ElementTree as ET
+import sys
-from pathlib import Path
-import pickle
 import spacy
 import textacy
 from scipy import *
-import sys
+import os
 csv.field_size_limit(sys.maxsize)
-import pickle
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"
 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:
@@ -90,7 +28,7 @@ with open(config_ini) as f:
 # config logging
-filename = config.get("logging","filename")
+filename = FILEPATH + config.get("logging","filename")
 level = config.get("logging","level")
 if level == "INFO":
     level = logging.INFO
@@ -188,10 +126,13 @@ def printRandomDoc(textacyCorpus):
     :param textacyCorpus:
     """
     print()
-    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
-    randIndex = int((len(textacyCorpus) - 1) * random.random())
-    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
-                                                               textacyCorpus[randIndex].metadata))
+    if len(textacyCorpus) == 0:
+        printlog("NO DOCS IN CORPUS")
+    else:
+        printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
+        randIndex = int((len(textacyCorpus) - 1) * random.random())
+        printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
+                                                                   textacyCorpus[randIndex].metadata))
     print()
@@ -239,14 +180,14 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
     Load textacy-Corpus including spacy-parser out from file
     :param corpus_path: str
     :param corpus_name: str (should content the language like "_de_")
-    :param lang: str language code)
+    :param lang: str (language code) ir spacy.Language
     :return: texracy.Corpus, spacy.language
     """
     #ckeck for language
-    if "_de_" in corpus_name:
+    if "de_" in corpus_name:
         lang="de"
-    elif "_en_" in corpus_name:
+    elif "en_" in corpus_name:
         lang ="en"
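load_corpus now infers the language from the corpus-name prefix instead of requiring "_de_"/"_en_" markers, which matches the new <lang>_raw_ticket / <lang>_pre_ticket naming, and printRandomDoc no longer indexes into an empty corpus. A minimal usage sketch (the corpus name and path are the ones the other modules use):

from miscellaneous import load_corpus, printRandomDoc

# "de_raw_ticket" starts with "de_", so the German spacy pipeline is selected
corpus, parser = load_corpus(corpus_path="corpi/", corpus_name="de_raw_ticket")

printRandomDoc(corpus)  # logs "NO DOCS IN CORPUS" if every document was filtered out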

View File

@@ -1,76 +1,23 @@
 # -*- coding: utf-8 -*-
 from datetime import datetime
-print(datetime.now())
-from datetime import datetime
-import time
-import logging
-from stop_words import get_stop_words
-#import words as words
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
 import csv
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
 import sys
-csv.field_size_limit(sys.maxsize)
-import pickle
-import configparser as ConfigParser
 from miscellaneous import *
-import time
 from datetime import datetime
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-csv.field_size_limit(sys.maxsize)
 import time
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
 import textacy
 from scipy import *
-import sys
+import os
 csv.field_size_limit(sys.maxsize)
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
-import pickle
 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"
 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:
@@ -178,7 +125,6 @@ def getFirstSynonym(word, thesaurus=THESAURUS):
 # http://wortschatz.uni-leipzig.de/en/download
 import re
-from collections import Counter
 def words(text): return re.findall(r'\w+', text.lower())
@@ -250,7 +196,7 @@ def stringcleaning(stringstream):
         string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
         # cut_after
-        word = "gruss"
+        word = "gruss" #idee addressen enfernen --> postal.parser
         string = string.rpartition(word)[0] if word in string else string
         # lemmatize
@@ -347,29 +293,23 @@ def processDictstream(dictstream, funcdict, parser):
 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
-path2thesaurus_dict = config.get("thesaurus","pickle_file")
+path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
-path2wordsdict = config.get("spellchecking", "pickle_file")
+path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
-path2lemmadict = config.get("lemmatization","pickle_file")
+path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
-path2nouns_list = config.get("nouns","pickle_file")
+path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
-path2firstnameslist = config.get("firstnames","pickle_file")
+path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
-path2stopwordlist = config.get("de_stopwords","pickle_file")
+path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")
-corpus_de_path = config.get("de_corpus", "path")
+corpus_de_path = FILEPATH + config.get("de_corpus", "path")
-raw_de_name = config.get("de_corpus", "raw")
-pre_de_name = config.get("de_corpus", "pre")
-corpus_en_path = config.get("en_corpus", "path")
+corpus_en_path = FILEPATH + config.get("en_corpus", "path")
-raw_en_name = config.get("en_corpus", "raw")
-pre_en_name = config.get("en_corpus", "pre")
 custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
@@ -383,10 +323,8 @@ custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanke
                 "funktionieren", "kollege", "pruefen", "hoffen"
                 ]
 filter_tokens = [
     # removeENT(["PERSON"]),
-    # idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser
     keepNouns(),
@@ -403,8 +341,8 @@ filter_tokens = [
 ]
+#todo filtertokens haut alle raus
+filter_tokens = None
 clean_in_meta = {
     "Solution": [removePOS(["SPACE"])],
@@ -412,17 +350,43 @@ clean_in_meta = {
     "categoryName": [removePOS(["SPACE", "PUNCT"])]
 }
+def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
+    printlog("Preprocess {0}_corpus at {1}".format(lang,datetime.now()))
+    rawCorpus_name = lang + "_raw_ticket"
+    preCorpus_name = lang + "_pre_ticket"
+    #load raw corpus and create new one
+    raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path)
+    corpus = textacy.Corpus(parser)
+    ## process and add files to textacy-corpi,
+    corpus.add_texts(
+        processContentstream(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser),
+        processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
+    )
+    # leere docs aus corpi kicken
+    corpus.remove(lambda doc: len(doc) == 0)
+    for i in range(printrandom):
+        printRandomDoc(corpus)
+    #save corpus
+    save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)
 def main():
     start = time.time()
-    printlog("Preprocessing: {0}".format(datetime.now()))
     THESAURUS = load_obj(path2thesaurus_dict)
@@ -432,44 +396,9 @@ def main():
     NOUNS = load_obj(path2nouns_list)
     VORNAMEN = load_obj(path2firstnameslist)
+    preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" )
+    preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )
-    #load raw corpus and create new one
-    raw_de_corpus, DE_PARSER = load_corpus(corpus_name=raw_de_name, corpus_path=corpus_de_path)
-    raw_en_corpus, EN_PARSER = load_corpus(corpus_name=raw_en_name, corpus_path=corpus_en_path)
-    de_corpus = textacy.Corpus(DE_PARSER)
-    en_corpus = textacy.Corpus(EN_PARSER)
-    ## process and add files to textacy-corpi,
-    printlog("Preprocess and add texts to textacy-corpi")
-    de_corpus.add_texts(
-        processContentstream(corpus2Text(raw_de_corpus), token_filterlist=filter_tokens, parser=DE_PARSER),
-        processDictstream(corpus2Meta(raw_de_corpus), clean_in_meta,parser=raw_de_corpus.lang)
-    )
-    en_corpus.add_texts(
-        processContentstream(corpus2Text(raw_en_corpus), token_filterlist=filter_tokens, parser=EN_PARSER),
-        processDictstream(corpus2Meta(raw_en_corpus), clean_in_meta,parser=raw_en_corpus.lang)
-    )
-    # leere docs aus corpi kicken
-    de_corpus.remove(lambda doc: len(doc) == 0)
-    en_corpus.remove(lambda doc: len(doc) == 0)
-    for i in range(20):
-        printRandomDoc(de_corpus)
-        #printRandomDoc(en_corpus)
-    #save corpi
-    save_corpus(corpus=de_corpus, corpus_path=corpus_de_path, corpus_name=pre_de_name)
-    save_corpus(corpus=en_corpus, corpus_path=corpus_en_path, corpus_name=pre_en_name)
     end = time.time()
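filter_tokens is a list of token-level predicates (keepNouns, removePOS, removeENT, ...) that processContentstream applies to every spacy token; their implementations are outside this diff, and a todo in the hunk above notes that the current combination strips everything ("filtertokens haut alle raus"). A hypothetical sketch of the closure pattern such filters follow, for orientation only:

def removePOS(pos_list):
    # keep a token only if its part-of-speech tag is not in pos_list (sketch, not the repository code)
    def predicate(tok):
        return tok.pos_ not in pos_list
    return predicate

def keepNouns(pos_list=("NOUN", "PROPN")):
    # keep only tokens tagged as nouns (the real helper may also consult the pickled noun list)
    def predicate(tok):
        return tok.pos_ in pos_list
    return predicate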

View File

@@ -1,10 +1,11 @@
 # -*- coding: utf-8 -*-
 import re
 import time
 import json
-import spacy
+#import spacy
-import textacy
+#import textacy
 from functools import reduce
 start = time.time()
@@ -15,7 +16,6 @@ from datetime import datetime
 import xml.etree.ElementTree as ET
-print(datetime.now())
 """
 PARSER=spacy.load("de")
@@ -55,19 +55,8 @@ print(corpi)
 """
-import pickle
-def save_obj(obj, path):
-    with open(path + '.pkl', 'wb') as f:
-        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
-def load_obj(path ):
-    with open(path + '.pkl', 'rb') as f:
-        return pickle.load(f)
+"""
 def load_corpus(corpus_path, corpus_name, lang="de"):
     from pathlib import Path
@@ -95,7 +84,7 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
         corpus.add_doc(
             textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
     return corpus
+"""
 import os
 a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
 b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"
@@ -103,37 +92,16 @@ d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.
 c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"
-liste = [a,b,c,d]
-de_filepaths = [path for path in liste if os.path.basename(path).split("_")[0]=='de' and os.path.basename(path).split("_")[1]=='stopwords']
-from nltk.corpus import stopwords as nltk_stopwords
-from stop_words import get_stop_words
-import spacy
-from miscellaneous import *
-# from packages
-de_stop_words1 = list(get_stop_words("de"))
-de_stop_words2 = list(nltk_stopwords.words('german'))
-de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
-# from files
-de_stop_words_list = [list(textacy.fileio.read_file_lines(path)) for path in de_filepaths]
-de_stop_words4 = [item for sublist in de_stop_words_list for item in sublist]
-#print(de_stop_words4)
-de_stop_words = list(set(map(replaceRockDots(),list(map(textacy.preprocess.normalize_whitespace, de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
-print(len(de_stop_words))
+# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testra.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_testra.log &"
+scriptpath = os.path.dirname(os.path.realpath(__file__))
+"""
 # THESAURUS
 lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
 lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
@@ -212,7 +180,7 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
             pass
     return thesaurus
-    """
     for r in synroot:
         for element in r:
@@ -223,9 +191,8 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
            if id not in synset2Words.keys():
                synset2Words[id] = "WORD"
-    """
+    """
 """
 from postal.parser import parse_address
@@ -557,6 +524,7 @@ de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
 textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
 """
 end = time.time()
 print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))

View File

@@ -94,9 +94,6 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en
     printlog("doc_term_matrix: {0}".format(doc_term_matrix))
     printlog("id2term: {0}".format(id2term))
 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
 corpus_name = "de_corpus"
@@ -104,11 +101,6 @@ corpus_name = "de_corpus"
 de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path)
-for i in range(5):
-    printRandomDoc(de_corpus)
 # todo gescheites tf(-idf) maß finden
 ngrams = 1
@@ -128,6 +120,7 @@ printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
 printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
 """
+# build citionary of ticketcategories
 labelist = []
@@ -139,6 +132,7 @@ LABELDICT = {k: v for v, k in enumerate(labelist)}
 printlog(str("LABELDICT: {0}".format(LABELDICT)))
+def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False,
+                         corpus=de_corpus):
     printlog(