tidied up

This commit is contained in:
jannis.grundmann 2017-10-16 14:01:38 +02:00
parent 4fe12679fb
commit 56c8bce2d7
15 changed files with 944 additions and 1023 deletions

View File

@ -1,21 +1,67 @@
[filepath]
[thesaurus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl
path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml
path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv
small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
[spellchecking]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl
lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt
[lemmatization]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl
[nouns]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl
[firstnames]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl
[de_stopwords]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl
[logging]
level = INFO
filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
[de_corpus]
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv
path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = de_raw_ticket
pre = de_pre_ticket
[en_corpus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv
path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = en_raw_ticket
pre = en_pre_ticket
[tickets]
content_collumn_name = Description
metaliste = TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution
language = de
[preprocessing]
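For orientation, a minimal sketch of how these sections are read on the Python side with the standard configparser (the relative filename and the split(",") for metaliste are assumptions; the scripts below hard-code an absolute config_ini path):

import configparser as ConfigParser

config = ConfigParser.ConfigParser()
with open("config.ini") as f:          # assumed relative path for the example
    config.read_file(f)

corpus_de_path = config.get("de_corpus", "path")
metaliste = config.get("tickets", "metaliste").split(",")  # stored as one comma-separated string
print(corpus_de_path, metaliste[:3])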

View File

@ -1,6 +1,35 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
import time
from datetime import datetime
@ -17,87 +46,15 @@ import sys
csv.field_size_limit(sys.maxsize)
# load config
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"
content_collumn_name = "Description"
metaliste = [
"TicketNumber",
"Subject",
"CreatedDate",
"categoryName",
"Impact",
"Urgency",
"BenutzerID",
"VerantwortlicherID",
"EigentuemerID",
"Solution"
]
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
# todo configuration file
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""
# config logging
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
def printlog(string, level="INFO"):
"""log and prints"""
print(string)
if level == "INFO":
logging.info(string)
elif level == "DEBUG":
logging.debug(string)
elif level == "WARNING":
logging.warning(string)
def printRandomDoc(textacyCorpus):
import random
print()
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
textacyCorpus[randIndex].metadata))
print()
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
@ -146,75 +103,93 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
    yield metadata
def save_corpus(corpus, corpus_path, corpus_name, parser):
"""
# save stringstore
stringstore_path = corpus_path + corpus_name + '_strings.json'
with open(stringstore_path, "w") as file:
parser.vocab.strings.dump(file)
#todo save vocab?
"""
# save parser
parserpath = corpus_path + str(parser.lang) + '_parser'
parser.save_to_directory(parserpath)
# save content
contentpath = corpus_path + corpus_name + "_content.bin"
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
# save meta
metapath = corpus_path + corpus_name + "_meta.json"
textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
##################################################################################################
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
"""
content_collumn_name = "Description"
metaliste = [
"TicketNumber",
"Subject",
"CreatedDate",
"categoryName",
"Impact",
"Urgency",
"BenutzerID",
"VerantwortlicherID",
"EigentuemerID",
"Solution"
]
"""
content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = config.get("tickets","metaliste")
path2de_csv = config.get("de_corpus","input")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")
path2en_csv = config.get("en_corpus","input")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")
def main():
    start = time.time()

    printlog("Corporization: {0}".format(datetime.now()))

    #print paths
    path_csv_split = path2de_csv.split("/")
    printlog(path_csv_split[len(path_csv_split) - 1])
    path_csv_split = path2en_csv.split("/")
    printlog(path_csv_split[len(path_csv_split) - 1])

    start = time.time()

    DE_PARSER = spacy.load("de")
    EN_PARSER = spacy.load("en")

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)
    raw_de_corpus = textacy.Corpus(DE_PARSER)
    raw_en_corpus = textacy.Corpus(EN_PARSER)

    ## add files to textacy-corpus,
    printlog("Add texts to textacy-corpus")
    de_corpus.add_texts(
    ## add files to textacy-corpi,
    printlog("Add texts to textacy-corpi")
    raw_de_corpus.add_texts(
        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2de_csv, metaliste)
    )

    # kick empty docs out of the corpus
    de_corpus.remove(lambda doc: len(doc) == 0)

    for i in range(20):
        printRandomDoc(de_corpus)

    #save corpus
    save_corpus(corpus=de_corpus,corpus_path=corpus_path,corpus_name=corpus_name,parser=DE_PARSER)
    #todo the same for en_corpus

    raw_en_corpus.add_texts(
        ticketcsv_to_textStream(path2en_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2en_csv, metaliste)
    )

    # kick empty docs out of the corpi
    raw_de_corpus.remove(lambda doc: len(doc) == 0)
    raw_en_corpus.remove(lambda doc: len(doc) == 0)

    #for i in range(20):
    #    printRandomDoc(raw_de_corpus)
    #    printRandomDoc(raw_en_corpus)

    #save corpi
    save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
    save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)

Binary file not shown.

View File

@ -1,9 +0,0 @@
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"subject"}
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"frau hinrichs überdenkt die situation und macht dann neue anträge . dieses ticket wird geschlossen"}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"faxnummer 3166 wurde unter die telefonnummer 7179 im elektronischen telefonbuch eingetragen"}
{"categoryName":"lan","Subject":"defekte netzwerkdose frage zu vpn","Solution":"hallo herr rauner , die netzwerkdose weist z. z. keine verbindungsprobleme auf . falls doch welche bestehen , melden sie sich bitte bei uns . mit freunldichen grüßen aicha oikrim"}
{"categoryName":"betrieb","Subject":"sso login via browser mit zertifikat","Solution":"der login via zertifikat am sso - dienst mittels firefox und unicard sollte funktionieren . eventuell wurden durch ein browserupdate die einstellungen gelöscht . bitte prüfen sie ob die ca - zertifikate installiert sind : https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" und ob das kryptographie modul im firefox hinterlegt ist : https://service.tu-dortmund.de/group/intra/authentifizierung"}
{"categoryName":"elektronisches telefonbuch","Subject":"telephone contract","Solution":"erledigt"}
{"categoryName":"verwaltung","Subject":"laptop macht komische geräusche","Solution":"herr alexev swetlomier ( hiwi ) küümert sich bereits um das laptop und frau herbst weiß auch bescheid die zur zeit im urlaub ist"}

File diff suppressed because one or more lines are too long

init.py (339)
View File

@ -4,6 +4,9 @@ from datetime import datetime
import time
import logging

from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
@ -15,58 +18,35 @@ from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
# todo configuration file ?
# load config
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini" config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser() config = ConfigParser.ConfigParser()
with open(config_ini) as f: with open(config_ini) as f:
config.read_file(f) config.read_file(f)
"""
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"
# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)
DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")


def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))


def create_lemma_dict(path2lemmalist):
    """
    Creates a dict out of a file a la:

        l1 w1
        l1 w2
        l2 w1
        l2 w2

    Result will be used as lemma_dict["word"] --> lemma

    :param path2lemmalist: str
    :return: dictionary
    """
    lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(
        textacy.fileio.read_file_lines(path2lemmalist))))
def printlog(string, level="INFO"):
"""log and prints"""
print(string)
if level == "INFO":
logging.info(string)
elif level == "DEBUG":
logging.debug(string)
elif level == "WARNING":
logging.warning(string)
def save_obj(obj, path):
with open(path + '.pkl', 'wb') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(path ):
with open(path + '.pkl', 'rb') as f:
return pickle.load(f)
def create_lemma_dict(lemmalist):
    lemma_dict = {}
@ -81,69 +61,22 @@ def create_lemma_dict(lemmalist):
    return lemma_dict
"""
def build_thesaurus(path2lexicalentries, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

def build_thesaurus_dict(path2wordnet,returnall=False):
    """
Creates a dict out of the deWordNet
https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml
Result will be used as lemma_dict["word"] --> lemma
:param path2lexicalentries: str
:param returnall: bool if True, also return , word2synsets, synset2Words
:return: dictionaries: thesaurus
"""
lextree = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8"))
    lexroot = lextree.getroot()
synroot = syntree.getroot()
thesaurus = []
for r in synroot:
for element in r:
if element.tag == "Synset":
sysnet = []
attrib = element.attrib
id = attrib["id"]
for ro in lexroot:
for elem in ro:
if elem.tag == "LexicalEntry":
subs_dicts = [subentry.attrib for subentry in elem]
# <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
dic = {k: v for x in subs_dicts for k, v in x.items()} # to one dict
if "synset" in dic.keys():
if dic["synset"] == id:
string = (dic["writtenForm"])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# alle punkte raus
string = re.sub(r'[.]', "", string)
# alles in klammern raus
string = re.sub(r"\((.*)\)", " ", string)
# längeres leerzeichen normalisieren
string = textacy.preprocess.normalize_whitespace(string)
sysnet.append(string.lower().strip())
# nach anzhal der wörter in den strings sortieren
sysnet.sort(key=lambda x: len(x.split()))
if len(sysnet) != 0:
# todo warum sind manche leer?
thesaurus.append(sysnet)
return thesaurus
#todo thesaurus in dictionary
"""
def build_thesaurus(path2lexicalentries):#, path2synsets):
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
#syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
lexroot = lextree.getroot()
#synroot = syntree.getroot()
    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

@ -167,6 +100,9 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):

                if 'writtenForm' in lex_dict.keys():
                    string = (lex_dict["writtenForm"])

                    if string == "Kennwort":
                        pass

                    # replaceRockDots
                    string = re.sub(r'[ß]', "ss", string)
                    string = re.sub(r'[ö]', "oe", string)

@ -186,10 +122,12 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):

            word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1","w2"]}

    for word,synset in word2synsets.items():
        if word != '':
            for syn in synset:
                if syn not in synset2Words.keys():
                    synset2Words[syn] = [word]

@ -203,91 +141,135 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):

    thesaurus = {}
    thesaurus_template = {"w1" : "mainsyn"}

    for word,synset in word2synsets.items():
        try:
            # assumption: the first synonym is the main synonym   #todo search for (Hauptform) instead?
            thesaurus[word] = synset2Words[synset[0]][0]
        except:
            pass

    if returnall:
        return thesaurus, word2synsets, synset2Words
    else:
        return thesaurus
def create_stopword_lists(*paths):
""" """
    for r in synroot:
        for element in r:

            if element.tag == "Synset":
                synset = []
                attrib = element.attrib
                id = attrib["id"]

                if id not in synset2Words.keys():
                    synset2Words[id] = "WORD"

    creates a list of stoppwords from:
        spacy
        nltk
        stop_words

    :param paths: list of additional filepaths where each file looks like
        w1
        w2
        w3
    filenames must be a la de_stopwords_1.txt, en_stopwords_2.txt

    :return: lists: de_stopwords, en_stopwords
    """
## GERMAN
# from packages
de_stop_words1 = list(get_stop_words("de"))
de_stop_words2 = list(nltk_stopwords.words('german'))
de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
#from files
de_filepaths = []
for path in paths:
if os.path.basename(path).split("_")[0] == 'de' and os.path.basename(path).split("_")[
1] == 'stopwords':
de_filepaths.append(path)
def create_stopwordlist():

    de_stop_words1 = list(map(replaceRockDots(),
                              list(
                                  map(textacy.preprocess.normalize_whitespace,
                                      textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")
                                      )
                              )
                              )
                          )

    de_stop_words2 = list(map(replaceRockDots(),list(set(nltk_stopwords.words('german')))))
    de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))
    de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))

    de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))

    return de_stop_words

    #todo en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(nltk_stopwords.words('english'))))


    de_stop_words4 = list_from_files(*de_filepaths)

    #combine everything
    de_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
                                                             de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
## ENGLISH
# from packages
en_stop_words1 = list(get_stop_words("en"))
en_stop_words2 = list(nltk_stopwords.words('english'))
en_stop_words3 = list(__import__("spacy.en", globals(), locals(), ['object']).STOP_WORDS)
# from files
en_filepaths = [path for path in paths if
os.path.basename(path).split("_")[0] == 'en' and os.path.basename(path).split("_")[
1] == 'stopwords']
en_stop_words4 = list_from_files(*en_filepaths)
    # combine everything
    en_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
                                                             en_stop_words1 + en_stop_words2 + en_stop_words3 + en_stop_words4)))))

    return de_stop_words, en_stop_words


########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download
def build_words_for_spellchecking(path2words):
"""
create word-Counter for spellchecking
http://norvig.com/spell-correct.html
http://wortschatz.uni-leipzig.de/en/download
http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz
:return: Counter
"""
def words(text): return re.findall(r'\w+', text.lower())
return Counter(words(open(path2words).read()))
def words(text): return re.findall(r'\w+', text.lower())
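A brief, hedged usage sketch of the word counter defined above (the path is the [spellchecking] input from config.ini; the variable names are illustrative):

WORDS = build_words_for_spellchecking("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt")
print(WORDS.most_common(3))   # most frequent tokens in the background news corpus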
##################################################################################################
# goal: dictionaries for the thesaurus, the correct-word list and the lemmas as loadable files
# plus a clean stopword list and a noun list
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
path2wordnet = config.get("thesaurus","input")
#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
path2thesaurus_dict = config.get("thesaurus","pickle_file")
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"

# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2words_file = config.get("spellchecking","input")
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2wordlist = config.get("spellchecking","pickle_file")

# LEMMA
path2lemma_file = config.get("lemmatization","input")
path2lemmadict = config.get("lemmatization","pickle_file")

# NOMEN
nouns1 = config.get("nouns","input1")
nouns2 = config.get("nouns","input2")
path2nouns_list = config.get("nouns","pickle_file")

path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"

# VORNAMEN
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
firstnames_txt = config.get("firstnames","input")
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = config.get("firstnames","pickle_file")
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
# STOPWORDS
stop1 = config.get("de_stopwords","input1")
stop2 = config.get("de_stopwords","input2")
stop3 = config.get("de_stopwords","input3")
path2stopwordlist = config.get("de_stopwords","pickle_file")
@ -297,71 +279,42 @@ def main():
printlog("create and save lemma_dict") printlog("create and save lemma_dict")
LEMMAS = list( lemma_dict = create_lemma_dict(path2lemma_file)
textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
lemma_dict = create_lemma_dict(LEMMAS)
save_obj(lemma_dict, path2lemmadict) save_obj(lemma_dict, path2lemmadict)
printlog("Build and save Wordlist for Spellchecking") printlog("Build and save Wordlist for Spellchecking")
WORDS = Counter(words(open(path2words).read())) words = build_words_for_spellchecking(path2words_file)
save_obj(WORDS, path2wordlist) save_obj(words, path2wordlist)
printlog("Build and save Thesaurus") printlog("Build and save Thesaurus")
THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries) thesaurus = build_thesaurus_dict(path2wordnet)
save_obj(thesaurus, path2thesaurus_dict)
save_obj(THESAURUS, path2thesaurusdict)
printlog("Build and save stoppwortliste") printlog("Build and save stoppwortliste")
de_stop_words = create_stopwordlist() de_stop_words = create_stopword_lists(stop1, stop2, stop3)
save_obj(de_stop_words, path2stopwordlist) save_obj(de_stop_words, path2stopwordlist)
printlog("Build and save nomenliste") printlog("Build and save nomenliste")
NOUNS = list(textacy.fileio.read_file_lines( nouns = list_from_files(nouns1,nouns2)
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list( save_obj(nouns, path2nouns_list)
textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
save_obj(NOUNS, path2NOUNSlist)
printlog("Build and save fistnameslist")
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
save_obj(VORNAMEN, path2firstnameslist)
printlog("Build and save firstnameslist")
vornamen = list_from_files(firstnames_txt)
save_obj(vornamen, path2firstnameslist)
end = time.time() end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60)) printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60))

main.py (21) Normal file
View File

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
import init
import corporization
import preprocessing
from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
init.main()
printlog("")
corporization.main()
printlog("")
preprocessing.main()
printlog("")

miscellaneous.py (281) Normal file
View File

@ -0,0 +1,281 @@
# -*- coding: utf-8 -*-
import random
import time
import logging
import csv
import functools
import os.path
import re
import subprocess
import sys
import pickle
import warnings
import xml.etree.ElementTree as ET
import configparser as ConfigParser
from pathlib import Path
from datetime import datetime
from collections import Counter

import spacy
import textacy
from scipy import *
from textacy import Vectorizer
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
import enchant
import hunspell
from postal.parser import parse_address

csv.field_size_limit(sys.maxsize)

start = time.time()
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
# config logging
filename = config.get("logging","filename")
level = config.get("logging","level")
if level == "INFO":
level = logging.INFO
elif level == "DEBUG":
level = logging.DEBUG
elif level == "WARNING":
level = logging.WARNING
logging.basicConfig(filename=filename, level=level)
def printlog(string, level="INFO"):
"""log and prints"""
print(string)
if level == "INFO":
logging.info(string)
elif level == "DEBUG":
logging.debug(string)
elif level == "WARNING":
logging.warning(string)
def compose(*functions):
def compose2(f, g):
return lambda x: f(g(x))
return functools.reduce(compose2, functions, lambda x: x)
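A small illustration of compose(): functions are applied right to left, i.e. compose(f, g)(x) == f(g(x)); the helper names here are made up for the example.

strip_ws = lambda s: s.strip()
lower = lambda s: s.lower()
normalize = compose(strip_ws, lower)   # lower() runs first, then strip()
assert normalize("  Passwort  ") == "passwort"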
def get_calling_function():
"""finds the calling function in many decent cases.
https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
"""
fr = sys._getframe(1) # inspect.stack()[1][0]
co = fr.f_code
for get in (
lambda: fr.f_globals[co.co_name],
lambda: getattr(fr.f_locals['self'], co.co_name),
lambda: getattr(fr.f_locals['cls'], co.co_name),
lambda: fr.f_back.f_locals[co.co_name], # nested
lambda: fr.f_back.f_locals['func'], # decorators
lambda: fr.f_back.f_locals['meth'],
lambda: fr.f_back.f_locals['f'],
):
try:
func = get()
except (KeyError, AttributeError):
pass
else:
if func.__code__ == co:
return func
raise AttributeError("func not found")
def save_obj(obj, path):
with open(path , 'wb') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(path):
with open(path, 'rb') as f:
return pickle.load(f)
def replaceRockDots():
return lambda string: re.sub(r'[ß]', "ss",
(re.sub(r'[ö]', "oe",
(re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
def list_from_files(*paths):
"""
create string-list from file like
n1
n2
n3
:param paths: list(str) or str if single path
:return: list(str)
"""
listlist = []
for path in paths:
listlist.append(list(textacy.fileio.read_file_lines(path)))
    # flatten the list of lists into one list
liste = [item for sublist in listlist for item in sublist]
return list(map(textacy.preprocess.normalize_whitespace, liste))
def printRandomDoc(textacyCorpus):
"""
    prints and logs a random doc out of a textacy corpus
:param textacyCorpus:
"""
print()
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
textacyCorpus[randIndex].metadata))
print()
def save_corpus(corpus, corpus_path, corpus_name):
"""
saves a textacy-corpus including spacy-parser
:param corpus: textacy-Corpus
:param corpus_path: str
    :param corpus_name: str (should contain the language tag, like "_de_")
"""
"""
# save stringstore
stringstore_path = corpus_path + corpus_name + '_strings.json'
with open(stringstore_path, "w") as file:
parser.vocab.strings.dump(file)
#todo save vocab?
"""
# save parser
parser = corpus.spacy_lang
parserpath = corpus_path + str(parser.lang) + '_parser'
parser.save_to_directory(parserpath)
# save content
contentpath = corpus_path + corpus_name + "_content.bin"
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
# save meta
metapath = corpus_path + corpus_name + "_meta.json"
textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
def load_corpus(corpus_path, corpus_name, lang="de"):
"""
    Load a textacy corpus, including its spacy parser, from file
    :param corpus_path: str
    :param corpus_name: str (should contain the language tag, like "_de_")
    :param lang: str (language code)
    :return: textacy.Corpus, spacy.language
    """
    # check for language
if "_de_" in corpus_name:
lang="de"
elif "_en_" in corpus_name:
lang ="en"
# load parser
parser = spacy.load(lang)
stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
with open(stringstorepath) as file:
parser.vocab.strings.load(file)
vocabpath = Path(corpus_path + str(lang) + '_parser'+'/vocab/lexemes.bin')
parser.vocab.load_lexemes(vocabpath)
#load corpus
corpus = textacy.Corpus(parser)
contentpath = corpus_path + corpus_name + "_content.bin"
metapath = corpus_path + corpus_name + "_meta.json"
metadata_stream = textacy.fileio.read_json_lines(metapath)
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
corpus.add_doc(
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
return corpus, corpus.spacy_lang
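A minimal round-trip sketch using the two helpers above; de_corpus stands for any textacy.Corpus built elsewhere, and the path and name follow the [de_corpus] section of config.ini (both are assumptions for the example):

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
save_corpus(corpus=de_corpus, corpus_path=corpus_path, corpus_name="de_raw_ticket")
reloaded, parser = load_corpus(corpus_path=corpus_path, corpus_name="de_raw_ticket")
printlog("reloaded corpus with {0} docs".format(len(reloaded)))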

View File

@ -420,10 +420,10 @@ custom_words = ["grüßen", "fragen"]
####################'####################'####################'####################'####################'##############

## files to textacy-corpus
## files to textacy-corpi
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")
print("add texts to textacy-corpi...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))

View File

@ -182,8 +182,8 @@ cleanStream = compose(
    cleanEnt
)
"""

# content:  xml -> stringCleaning -> pipe -> docCleaning -> corpus
# content:  xml -> stringCleaning -> pipe -> docCleaning -> corpi
# metadata: xml ->                   stringCleaning       -> corpus
# metadata: xml ->                   stringCleaning       -> corpi

corpus = textacy.Corpus(PARSER)

View File

@ -2,27 +2,53 @@
from datetime import datetime
print(datetime.now())
from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"
# idea: save a raw corpus (only whitespace removed) -> preprocessed corpus -> work with that
path_csv_split = path2de_csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])
path_csv_split = path2en_csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])
import time
start = time.time()
from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import time
import logging
from nltk.corpus import stopwords
@ -40,231 +66,29 @@ csv.field_size_limit(sys.maxsize)
import pickle
def save_obj(obj, path):
with open(path + '.pkl', 'wb') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(path ):
with open(path + '.pkl', 'rb') as f:
return pickle.load(f)
def load_corpus(corpus_path, corpus_name, lang="de"):
contentpath = corpus_path + corpus_name + "_content.bin"
metapath = corpus_path + corpus_name + "_meta.json"
#load parser
parserpath = corpus_path + str(lang) + '_parser'
parser = spacy.load(parserpath)
corpus = textacy.Corpus(parser)
metadata_stream = textacy.fileio.read_json_lines(metapath)
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
corpus.add_doc(
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
return corpus
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"
print(load_corpus(corpus_path,corpus_name))
# load config
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
# todo configuration file ?
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""
# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

# THESAURUS
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
THESAURUS = load_obj(path2thesaurusdict)
# SPELLCHECKING
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
# SPELLCHECKING
parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")
de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))
en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(stopwords.words('english'))))
LEMMAS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
NOUNS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
"""
print(de_stop_words[10:30])
print(LEMMAS[10:30])
print(VORNAMEN[10:30])
print(NOUNS[10:30])
"""
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
def printlog(string, level="INFO"):
"""log and prints"""
print(string)
if level == "INFO":
logging.info(string)
elif level == "DEBUG":
logging.debug(string)
elif level == "WARNING":
logging.warning(string)
printlog("Load functions")
def compose(*functions):
def compose2(f, g):
return lambda x: f(g(x))
return functools.reduce(compose2, functions, lambda x: x)
def get_calling_function():
"""finds the calling function in many decent cases.
https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
"""
fr = sys._getframe(1) # inspect.stack()[1][0]
co = fr.f_code
for get in (
lambda: fr.f_globals[co.co_name],
lambda: getattr(fr.f_locals['self'], co.co_name),
lambda: getattr(fr.f_locals['cls'], co.co_name),
lambda: fr.f_back.f_locals[co.co_name], # nested
lambda: fr.f_back.f_locals['func'], # decorators
lambda: fr.f_back.f_locals['meth'],
lambda: fr.f_back.f_locals['f'],
):
try:
func = get()
except (KeyError, AttributeError):
pass
else:
if func.__code__ == co:
return func
raise AttributeError("func not found")
def printRandomDoc(textacyCorpus):
import random
print()
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
textacyCorpus[randIndex].metadata))
print()
def csv_to_contentStream(path2csv: str, content_collumn_name: str):
"""
:param path2csv: string
:param content_collumn_name: string
:return: string-generator
"""
stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8')
content_collumn = 0 # standardvalue
for i, lst in enumerate(stream):
if i == 0:
# look for desired column
for j, col in enumerate(lst):
if col == content_collumn_name:
content_collumn = j
else:
yield lst[content_collumn]
def csv_to_metaStream(path2csv: str, metalist: [str]):
"""
:param path2csv: string
:param metalist: list of strings
:return: dict-generator
"""
stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8')
content_collumn = 0 # standardvalue
metaindices = []
metadata_temp = {}
for i, lst in enumerate(stream):
if i == 0:
for j, col in enumerate(lst): # geht bestimmt effizienter... egal, weil passiert nur einmal
for key in metalist:
if key == col:
metaindices.append(j)
metadata_temp = dict(
zip(metalist, metaindices)) # zB {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}
else:
metadata = metadata_temp.copy()
for key, value in metadata.items():
metadata[key] = lst[value]
yield metadata
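A hedged sketch of how the two CSV generators above feed a textacy corpus (the parser choice and the meta fields are illustrative, not prescribed by this file):

PARSER = spacy.load("de")
corpus = textacy.Corpus(PARSER)
corpus.add_texts(
    csv_to_contentStream(path2de_csv, "Description"),
    csv_to_metaStream(path2de_csv, ["TicketNumber", "Subject", "categoryName"])
)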
THESAURUS = {}
WORDS = {}
LEMMAS = {}
NOUNS = []
VORNAMEN= []
de_stop_words=[]
############# filter tokens
@ -303,14 +127,12 @@ def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search('\d', tok.lower_))

"""

def remove_words_containing_topLVL():
    return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))
    return lambda tok: not bool(re.search(REGEX_TOPLVL, tok.lower_))


def remove_words_containing_specialCharacters():
    return lambda tok: not bool(re.search(regex_specialChars, tok.lower_))
    return lambda tok: not bool(re.search(REGEX_SPECIALCHAR, tok.lower_))

"""


def remove_long_words():
@ -327,237 +149,28 @@ def remove_first_names():
############# strings
def replaceRockDots():
return lambda string: re.sub(r'[ß]', "ss",
(re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
def remove_addresses(string):
    pass  # todo
def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
"""
def stringcleaning(stringstream, funclist):
for string in stringstream:
for f in funclist:
string = f(string)
yield string
def cut_after(word="gruss"):
return lambda string: string.rpartition(word)[0] if word in string else string
def seperate_words_on_regex(regex=regex_specialChars):
return lambda string: " ".join(re.compile(regex).split(string))
def remove_words_containing_topLVL():
return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w) ])
def replaceSpecialChars(replace_with=" "):
return lambda string: re.sub(regex_specialChars, replace_with, string.lower())
def replaceNumbers(replace_with="NUMBER"):
return lambda string : textacy.preprocess.replace_numbers(string.lower(), replace_with=replace_with)
def replacePhonenumbers(replace_with="PHONENUMBER"):
return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)
def replaceSharpS(replace_with="ss"):
return lambda string: re.sub(r'[ß]',replace_with,string.lower())
def fixUnicode():
return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
"""
"""
def lemmatizeWord(word,filepath=LEMMAS):
for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
if word.lower() == line.split()[1].strip().lower():
return line.split()[0].strip().lower()
return word.lower() # falls nix gefunden wurde
def create_lemma_dicts(lemmalist=LEMMAS):
w_dict = {}
lem_dict = {}
for i, line in enumerate(lemmalist):
try:
lem_word_pair = line.split()
if len(lem_word_pair) != 2:
print(line)
lemma = lem_word_pair[0].strip().lower()
word = lem_word_pair[1].strip().lower()
except:
print(line)
if lemma not in lem_dict:
lem_dict[lemma] = i
if word not in w_dict:
w_dict[word] = lem_dict[lemma]
l_dict = {v: k for k, v in lem_dict.items()} # switch key/values
return l_dict,w_dict
lemma_dict,word_dict = create_lemma_dicts()
def lemmatizeWord(word,l_dict=lemma_dict,w_dict=word_dict):
#mehrmals machen
for i in range(3):
try:
word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower()
except:
print(word)
return word
def lemmatize():
return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
def lemmatize():
return lambda string: " ".join([lemmatizeWord(s.lower()) for s in string.split()])
DE_SPELLCHECKER = enchant.Dict("de_DE")
EN_SPELLCHECKER = enchant.Dict("en_US")
def autocorrectWord(word,spellchecker=DE_SPELLCHECKER):
try:
return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
except:
return word
def autocorrect():
return lambda string: " ".join([autocorrectWord(s.lower()) for s in string.split()])
"""
def create_lemma_dicts(lemmalist=LEMMAS):
w_dict = {}
lem_dict = {}
for i, line in enumerate(lemmalist):
try:
lem_word_pair = line.split()
if len(lem_word_pair) != 2:
print(line)
lemma = lem_word_pair[0].strip().lower()
word = lem_word_pair[1].strip().lower()
except:
print(line)
if lemma not in lem_dict:
lem_dict[lemma] = i
if word not in w_dict:
w_dict[word] = lem_dict[lemma]
l_dict = {v: k for k, v in lem_dict.items()} # switch key/values
return l_dict, w_dict
lemma_dict, word_dict = create_lemma_dicts()
def lemmatizeWord(word, l_dict=lemma_dict, w_dict=word_dict, n=3):
    # do this several times
    for i in range(n):
        try:
            word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower()
            word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
        except:
            print(word)
    return word
def build_thesaurus(path2lexicalentries, path2synsets):
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
lexroot = lextree.getroot()
synroot = syntree.getroot()
thesaurus = []
for r in synroot:
for element in r:
if element.tag == "Synset":
sysnet = []
attrib = element.attrib
id = attrib["id"]
for ro in lexroot:
for elem in ro:
if elem.tag == "LexicalEntry":
subs_dicts = [subentry.attrib for subentry in elem]
# <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
dic = {k: v for x in subs_dicts for k, v in x.items()} # to one dict
if "synset" in dic.keys():
if dic["synset"] == id:
string = (dic["writtenForm"])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# alle punkte raus
string = re.sub(r'[.]', "", string)
# alles in klammern raus
string = re.sub(r"\((.*)\)", " ", string)
# längeres leerzeichen normalisieren
string = textacy.preprocess.normalize_whitespace(string)
sysnet.append(string.lower().strip())
# nach anzhal der wörter in den strings sortieren
sysnet.sort(key=lambda x: len(x.split()))
if len(sysnet) != 0:
# todo warum sind manche leer?
thesaurus.append(sysnet)
return thesaurus
printlog("Build Thesaurus")
THESAURUS = []
THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)
def getFirstSynonym(word, thesaurus=THESAURUS):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    if word in thesaurus.keys():
        return thesaurus[word]
    else:
        return str(word)

    # iterate through the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word; todo: also normalize phrases
                if word == syn:
                    return syn_block[0]

    return str(word)  # as a fallback, return the original word
########################## Spellchecking ##########################################
@ -570,10 +183,6 @@ from collections import Counter
def words(text): return re.findall(r'\w+', text.lower())
printlog("Build Wordlist for Spellchecking")
WORDS = {}
WORDS = Counter(words(open(path2words).read()))
def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N
@ -610,18 +219,6 @@ def edits2(word):
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
"""
DE_SPELLCHECKER = enchant.Dict("de_DE")
EN_SPELLCHECKER = enchant.Dict("en_US")
def autocorrectWord(word, spellchecker=DE_SPELLCHECKER):
try:
return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
except:
return word
"""
def autocorrectWord(word):
    try:
        return correction(word)
@ -629,15 +226,10 @@ def autocorrectWord(word):
        return word
##################################################################################################
############# stringcleaning

def stringcleaning(stringstream):
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
    for string in stringstream:
        string = string.lower()
@ -646,7 +238,7 @@ def stringcleaning(stringstream):
        string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')

        # remove_words_containing_topLVL
        string = " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)])
        string = " ".join([w.lower() for w in string.split() if not re.search(REGEX_TOPLVL, w)])
        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)
@ -655,7 +247,7 @@ def stringcleaning(stringstream):
        string = re.sub(r'[ä]', "ae", string)

        # seperate_words_on_regex:
        string = " ".join(re.compile(regex_specialChars).split(string))
        string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
        # cut_after
        word = "gruss"
@ -672,8 +264,27 @@ def stringcleaning(stringstream):
        yield string
def filterTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
tokens = list(filter(f, tokens))
    return tokens


def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
def corpus2Text(corpus):
for doc in corpus:
yield doc.text
def corpus2Meta(corpus):
for doc in corpus:
yield doc.metadata
def processContentstream(textstream, parser, token_filterlist=None):
""" """
:param textstream: string-gen :param textstream: string-gen
:param funclist: [func] :param funclist: [func]
@ -681,28 +292,6 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
    :return: string-gen
    """
"""
filter_tokens=[
#removeENT(["PERSON"]),
#idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser
#idee rechtschreibkorrektur --> PyEnchant
#idee thesaurus --> WordNet, eigener
remove_words_containing_Numbers(),
removePOS(["PUNCT","SPACE","NUM"]),
removeWords(de_stop_words+custom_words),
remove_long_words(),
remove_short_words(),
remove_first_names(),
keepPOS(["NOUN"]),
]
"""
    # pre_parse
    textstream = stringcleaning(textstream)
@ -720,8 +309,7 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
yield " ".join([tok.lower_ for tok in tokens]) yield " ".join([tok.lower_ for tok in tokens])
# yield " ".join(list(set([tok.lower_ for tok in tokens]))) # yield " ".join(list(set([tok.lower_ for tok in tokens])))
def processDictstream(dictstream, funcdict, parser):
def processDictstream(dictstream, funcdict, parser=DE_PARSER):
""" """
:param dictstream: dict-gen :param dictstream: dict-gen
@ -754,58 +342,34 @@ def processDictstream(dictstream, funcdict, parser=DE_PARSER):
        yield result
def filterTokens(tokens, funclist):
    # in:  tokenlist, funclist
    # out: tokenlist
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens


##################################################################################################

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
def cleanString(string):
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
    # normalize longer runs of whitespace
    string = textacy.preprocess.normalize_whitespace(string)

    return(string)


def normalizeTextStream(textstream,clean=False):
    """
    :param textstream: string-gen
    :param parser: spacy-parser
    :yield: string-gen
    """
    for txt in textstream:
        if clean:
            yield cleanString(txt)
        else:
            yield textacy.preprocess.normalize_whitespace(txt)


def nomalizeDictstream(dictstream, clean=False):
    """
    :param dictstream: dict-gen
    :param parser: spacy-parser
    :yield: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            if clean:
                result[key] = cleanString(value)
            else:
                result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result


path2thesaurus_dict = config.get("thesaurus","pickle_file")
path2wordsdict = config.get("spellchecking", "pickle_file")
path2lemmadict = config.get("lemmatization","pickle_file")
path2nouns_list = config.get("nouns","pickle_file")
path2firstnameslist = config.get("firstnames","pickle_file")
path2stopwordlist = config.get("de_stopwords","pickle_file")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")
pre_de_name = config.get("de_corpus", "pre")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")
pre_en_name = config.get("en_corpus", "pre")
custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus", custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
@ -819,6 +383,7 @@ custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanke
"funktionieren", "kollege", "pruefen", "hoffen" "funktionieren", "kollege", "pruefen", "hoffen"
] ]
filter_tokens = [
    # removeENT(["PERSON"]),
    # idea: remove addresses; so far handled with cut_after("gruss") --> postal.parser
@ -829,7 +394,8 @@ filter_tokens = [
removePOS(["PUNCT", "SPACE", "NUM"]), removePOS(["PUNCT", "SPACE", "NUM"]),
removeWords(de_stop_words + custom_words), #removeWords(de_stop_words + custom_words),
removeWords(de_stop_words),
remove_long_words(), remove_long_words(),
remove_short_words(), remove_short_words(),
@ -838,11 +404,7 @@ filter_tokens = [
]
metaliste = [
"Subject",
"categoryName",
"Solution"
]
clean_in_meta = {
    "Solution": [removePOS(["SPACE"])],
@ -850,6 +412,78 @@ clean_in_meta = {
"categoryName": [removePOS(["SPACE", "PUNCT"])] "categoryName": [removePOS(["SPACE", "PUNCT"])]
} }
def main():
    start = time.time()
    printlog("Preprocessing: {0}".format(datetime.now()))

    THESAURUS = load_obj(path2thesaurus_dict)
    WORDS = load_obj(path2wordsdict)
    LEMMAS = load_obj(path2lemmadict)
    DE_STOP_WORDS = load_obj(path2stopwordlist)
    NOUNS = load_obj(path2nouns_list)
    VORNAMEN = load_obj(path2firstnameslist)

    # load raw corpora and create new ones
    raw_de_corpus, DE_PARSER = load_corpus(corpus_name=raw_de_name, corpus_path=corpus_de_path)
    raw_en_corpus, EN_PARSER = load_corpus(corpus_name=raw_en_name, corpus_path=corpus_en_path)

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)

    # process and add texts to the textacy corpora
    printlog("Preprocess and add texts to textacy-corpi")
    de_corpus.add_texts(
        processContentstream(corpus2Text(raw_de_corpus), token_filterlist=filter_tokens, parser=DE_PARSER),
        processDictstream(corpus2Meta(raw_de_corpus), clean_in_meta, parser=raw_de_corpus.lang)
    )

    en_corpus.add_texts(
        processContentstream(corpus2Text(raw_en_corpus), token_filterlist=filter_tokens, parser=EN_PARSER),
        processDictstream(corpus2Meta(raw_en_corpus), clean_in_meta, parser=raw_en_corpus.lang)
    )

    # kick empty docs out of the corpora
    de_corpus.remove(lambda doc: len(doc) == 0)
    en_corpus.remove(lambda doc: len(doc) == 0)

    for i in range(20):
        printRandomDoc(de_corpus)
        #printRandomDoc(en_corpus)

    # save corpora
    save_corpus(corpus=de_corpus, corpus_path=corpus_de_path, corpus_name=pre_de_name)
    save_corpus(corpus=en_corpus, corpus_path=corpus_en_path, corpus_name=pre_en_name)

    end = time.time()
    printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()
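main() streams texts and metadata out of the raw corpora through corpus2Text and corpus2Meta; their definitions are not part of this hunk, but they are presumably thin generators along these lines:

# Assumed shape of the helpers used in main() above (the real versions live
# elsewhere in the repo, probably miscellaneous.py):
def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text

def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata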
""" """
pipe=[ pipe=[
@ -889,37 +523,24 @@ pipe=[
""" """
de_corpus = textacy.Corpus(DE_PARSER) """
en_corpus = textacy.Corpus(EN_PARSER) filter_tokens=[
#removeENT(["PERSON"]),
#idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser
#idee rechtschreibkorrektur --> PyEnchant
#idee thesaurus --> WordNet, eigener
remove_words_containing_Numbers(),
removePOS(["PUNCT","SPACE","NUM"]),
## add files to textacy-corpus, removeWords(de_stop_words+custom_words),
printlog("Add texts to textacy-corpus")
de_corpus.add_texts(
processContentstream(csv_to_contentStream(path2de_csv, "Description"), token_filterlist=filter_tokens),
processDictstream(csv_to_metaStream(path2de_csv, metaliste), clean_in_meta)
)
remove_long_words(),
remove_short_words(),
remove_first_names(),
keepPOS(["NOUN"]),
# leere docs aus corpus kicken ]
de_corpus.remove(lambda doc: len(doc) == 0) """
for i in range(20):
printRandomDoc(de_corpus)
#save corpus
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_corpus"
save_corpus(corpus=de_corpus,corpus_path=corpus_path,corpus_name=corpus_name)
#todo das selbe mit en_corpus
end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
test.py
@@ -517,8 +517,8 @@ clean_in_content=[

## add files to textacy-corpi,
printlog("add texts to textacy-corpi")
ticketcorpus.add_texts(
    processTextstream(csv_to_contentStream(path2csv, "Description"), clean_in_content),
    processDictstream(csv_to_metaStream(path2csv, metaliste), clean_in_meta)

@@ -558,7 +558,7 @@ def label2ID(label,labeldict=LABELDICT):

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text

@@ -596,7 +596,7 @@ n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen

printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)

@@ -620,8 +620,8 @@ printlog("Initialize and train a topic model..")

model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

# Transform the corpi and interpret our model:
printlog("Transform the corpi and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()
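After fit and transform, the topics are usually inspected through the model's term rankings; a sketch, assuming this textacy version exposes TopicModel.top_topic_terms and the vectorizer's id_to_term mapping:

# Hedged sketch (API names assumed for this textacy version, not shown in the diff):
# print the strongest terms per topic.
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, topics=-1, top_n=10):
    printlog("topic {0}: {1}".format(topic_idx, " ".join(top_terms)))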
@@ -841,15 +841,15 @@ de_corpus = textacy.Corpus(DE_PARSER)

## add files to textacy-corpi,
printlog("add texts to textacy-corpi")
de_corpus.add_texts(
    processContentstream(csv_to_contentStream(path2csv, "Description"), token_filterlist=filter_tokens),
    processDictstream(csv_to_metaStream(path2csv, metaliste), clean_in_meta)
)

# kick empty docs out of the corpi
de_corpus.remove(lambda doc: len(doc) == 0)

@@ -873,7 +873,7 @@ def printvecotorization(ngrams = 1,min_df = 1,max_df = 1.0,weighting ='tf',named

    #printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)

@@ -908,7 +908,7 @@ printvecotorization(ngrams=(1,2),min_df=1,max_df=0.8,weighting=weighting)

"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "de_corpus"
corpus_compression = 'gzip'
de_corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)

@@ -951,7 +951,7 @@ def topicModeling(ngrams,min_df,max_df,topicModel = 'lda',n_topics = len(LABELDI

    ####################'####################
    #printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)

@@ -971,8 +971,8 @@ def topicModeling(ngrams,min_df,max_df,topicModel = 'lda',n_topics = len(LABELDI

    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

    # Transform the corpi and interpret our model:
    #printlog("Transform the corpi and interpret our model..")
    doc_topic_matrix = model.transform(doc_term_matrix)
    print()

@@ -1016,35 +1016,35 @@ topicModeling(ngrams = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpi=de_corpus)

topicModeling(ngrams = 1,
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.2,
              max_df = 0.8,
              topicModel = 'lda',
              n_topics = 20,
              corpi=de_corpus)

@@ -1124,7 +1124,7 @@ def label2ID(label,labeldict=LABELDICT):

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
testra.py
@@ -21,7 +21,7 @@ print(datetime.now())

PARSER = spacy.load("de")
corpi = textacy.Corpus(PARSER)

testcontetn = [
    "fdsfdsfsd",

@@ -46,12 +46,12 @@ def makemeta( testmetda):
    yield metdata

corpi.add_texts(
    makecontent(testcontetn),
    makemeta(testmetda)
)
print(corpi)
"""
@@ -66,12 +66,79 @@ def load_obj(path ):
        return pickle.load(f)
def load_corpus(corpus_path, corpus_name, lang="de"):
    from pathlib import Path

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
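A usage sketch for the load_corpus defined above; the path and corpus name are placeholders, not values from the diff:

# Illustrative call; substitute the path/name configured in config.ini.
corpus = load_corpus(
    corpus_path="/path/to/corpi/",        # placeholder
    corpus_name="de_raw_ticket_corpus",   # placeholder
    lang="de",
)
print(len(corpus))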
import os
a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"
d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt"
c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"
liste = [a,b,c,d]
de_filepaths = [path for path in liste if os.path.basename(path).split("_")[0]=='de' and os.path.basename(path).split("_")[1]=='stopwords']
from nltk.corpus import stopwords as nltk_stopwords
from stop_words import get_stop_words
import spacy
from miscellaneous import *
# from packages
de_stop_words1 = list(get_stop_words("de"))
de_stop_words2 = list(nltk_stopwords.words('german'))
de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
# from files
de_stop_words_list = [list(textacy.fileio.read_file_lines(path)) for path in de_filepaths]
de_stop_words4 = [item for sublist in de_stop_words_list for item in sublist]
#print(de_stop_words4)
de_stop_words = list(set(map(replaceRockDots(),list(map(textacy.preprocess.normalize_whitespace, de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
print(len(de_stop_words))
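replaceRockDots comes from miscellaneous.py and is not shown in this diff; since it is called once and the result is mapped over strings, it presumably returns a str-to-str callable roughly like this sketch, mirroring the substitutions in cleanString above:

# Hypothetical reconstruction of replaceRockDots (the real one lives in miscellaneous.py).
def replaceRockDots():
    return lambda string: re.sub(r'[ä]', "ae",
                          re.sub(r'[ü]', "ue",
                          re.sub(r'[ö]', "oe",
                          re.sub(r'[ß]', "ss", string))))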
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testra.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_testra.log &"
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
def build_thesaurus(path2lexicalentries):  #, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
@@ -159,29 +226,6 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):

"""
def load_corpus(corpus_path, corpus_name, lang="de"):
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    # load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

# todo: load corpus from file; idea: load stringstore and vocab
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"
print(load_corpus(corpus_path, corpus_name))
"""
from postal.parser import parse_address

@@ -197,12 +241,12 @@ print(parse_address(address))

"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/" corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "testcorpus" corpus_name = "testcorpus"
#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression) #corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression) #corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
@ -225,12 +269,12 @@ def save_corpus(corpus_path,corpus_name):
#save content #save content
contentpath = corpus_path + corpus_name+ "_content.bin" contentpath = corpus_path + corpus_name+ "_content.bin"
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus),contentpath) textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi),contentpath)
#save meta #save meta
metapath = corpus_path + corpus_name +"_meta.json" metapath = corpus_path + corpus_name +"_meta.json"
textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath) textacy.fileio.write_json_lines((doc.metadata for doc in corpi), metapath)
@@ -243,8 +287,8 @@ def load_corpus(corpus_path,corpus_name):

    with open(stringstore_path, "r") as file:
        nlp.vocab.strings.load(file)

    # define corpi
    corpi = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name + "_meta.json"

@@ -252,13 +296,13 @@ def load_corpus(corpus_path,corpus_name):

    # load content
    contentpath = corpus_path + corpus_name + "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpi.add_doc(
            textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))
    return corpi
save_corpus(corpus_path, corpus_name)
@@ -10,6 +10,46 @@ import time

import enchant

start = time.time()
from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
import time
from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import logging
@@ -34,56 +74,6 @@ from postal.parser import parse_address

csv.field_size_limit(sys.maxsize)
def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)

printlog("Load functions")


def printRandomDoc(textacyCorpus):
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))
    print()


def load_corpus(corpus_path, corpus_name):
    # load new lang
    nlp = spacy.load("de")

    # load stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "r") as file:
        nlp.vocab.strings.load(file)

    # define corpus
    corpus = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name + "_meta.json"
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    # load content
    contentpath = corpus_path + corpus_name + "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):

@@ -92,7 +82,7 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en

    printlog(str("max_df: {0}".format(max_df)))
    printlog(str("named_entities: {0}".format(named_entities)))

    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)

@@ -107,10 +97,10 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "de_corpus"

# load corpi
de_corpus = load_corpus(corpus_name=corpus_name, corpus_path=corpus_path)

@@ -172,7 +162,7 @@ def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(

    ####################'####################
    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)

@@ -191,8 +181,8 @@ def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(

    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

    # Transform the corpi and interpret our model:
    # printlog("Transform the corpi and interpret our model..")
    doc_topic_matrix = model.transform(doc_term_matrix)
    print()
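The line that actually builds doc_term_matrix falls into an elided hunk; with this Vectorizer it is presumably the usual fit_transform over the terms stream:

    # Presumed construction of doc_term_matrix (not visible in this hunk):
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    printlog("doc_term_matrix shape: {0}".format(doc_term_matrix.shape))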
@@ -228,35 +218,35 @@ topicModeling(ngrams = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpi=de_corpus)

topicModeling(ngrams = 1,
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.2,
              max_df = 0.8,
              topicModel = 'lda',
              n_topics = 20,
              corpi=de_corpus)
@@ -292,7 +282,7 @@ def label2ID(label, labeldict=LABELDICT):

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text