aufgeräumt ("tidied up")

This commit is contained in:
parent 4fe12679fb
commit 56c8bce2d7

config.ini (70 lines changed)
@@ -1,21 +1,67 @@
[filepath]
thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv
path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml
path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv
small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv

[thesaurus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl

logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log

[spellchecking]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl

lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt

[lemmatization]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl

[nouns]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl

[firstnames]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl

[de_stopwords]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl

[logging]
level = INFO
filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log

[de_corpus]
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv

path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = de_raw_ticket
pre = de_pre_ticket

[en_corpus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv

path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = en_raw_ticket
pre = en_pre_ticket

[tickets]
content_collumn_name = Description
metaliste = TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution

language = de

[preprocessing]
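For orientation, the scripts touched by this commit read this file at import time with Python's configparser. A minimal sketch of that lookup pattern, using only section/key names that appear in the config above; note that config.get() always returns a string, so splitting the comma-separated metaliste value is an assumption about what the consumer has to do:

import configparser as ConfigParser

config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

# values come back as plain strings
content_collumn_name = config.get("tickets", "content_collumn_name")   # "Description"
metaliste = config.get("tickets", "metaliste").split(",")              # presumably split into column names by the caller
path2de_csv = config.get("de_corpus", "input")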
corporization.py (197 lines changed)

@@ -1,6 +1,35 @@
# -*- coding: utf-8 -*-

from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *


import time

from datetime import datetime
@@ -17,87 +46,15 @@ import sys
csv.field_size_limit(sys.maxsize)


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"

path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"

content_collumn_name = "Description"

metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
    "categoryName",
    "Impact",
    "Urgency",
    "BenutzerID",
    "VerantwortlicherID",
    "EigentuemerID",
    "Solution"
]

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"

# todo configuration file
"""
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""

# config logging
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)


def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def printRandomDoc(textacyCorpus):
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))

    print()


def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
@@ -146,75 +103,93 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
            yield metadata


def save_corpus(corpus, corpus_path, corpus_name, parser):
    """
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    #todo save vocab?
    """

    # save parser
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


##################################################################################################


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"


"""
content_collumn_name = "Description"
metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
    "categoryName",
    "Impact",
    "Urgency",
    "BenutzerID",
    "VerantwortlicherID",
    "EigentuemerID",
    "Solution"
]
"""

content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = config.get("tickets","metaliste")

path2de_csv = config.get("de_corpus","input")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")

path2en_csv = config.get("en_corpus","input")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")


def main():

    start = time.time()
    printlog("Corporization: {0}".format(datetime.now()))

    #print paths
    path_csv_split = path2de_csv.split("/")
    printlog(path_csv_split[len(path_csv_split) - 1])
    path_csv_split = path2en_csv.split("/")
    printlog(path_csv_split[len(path_csv_split) - 1])

    start = time.time()

    DE_PARSER = spacy.load("de")
    EN_PARSER = spacy.load("en")

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)
    raw_de_corpus = textacy.Corpus(DE_PARSER)
    raw_en_corpus = textacy.Corpus(EN_PARSER)

    ## add files to textacy-corpus,
    printlog("Add texts to textacy-corpus")
    ## add files to textacy-corpi,
    printlog("Add texts to textacy-corpi")

    de_corpus.add_texts(
    raw_de_corpus.add_texts(
        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2de_csv, metaliste)
    )

    # leere docs aus corpus kicken
    de_corpus.remove(lambda doc: len(doc) == 0)
    raw_en_corpus.add_texts(
        ticketcsv_to_textStream(path2en_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2en_csv, metaliste)
    )

    for i in range(20):
        printRandomDoc(de_corpus)
    # leere docs aus corpi kicken
    raw_de_corpus.remove(lambda doc: len(doc) == 0)
    raw_en_corpus.remove(lambda doc: len(doc) == 0)

    #save corpus
    #for i in range(20):
    #    printRandomDoc(raw_de_corpus)
    #    printRandomDoc(raw_en_corpus)

    save_corpus(corpus=de_corpus,corpus_path=corpus_path,corpus_name=corpus_name,parser=DE_PARSER)

    #todo das selbe mit en_corpus
    #save corpi
    save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
    save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)
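For reference, the raw-corpus build in main() pairs a text generator with an equally long metadata generator; a stripped-down sketch of that call shape with toy in-memory data instead of the CSV streams (the textacy 0.x API used by this repo is assumed, and the two lists only stand in for ticketcsv_to_textStream / ticket_csv_to_DictStream):

import spacy
import textacy

parser = spacy.load("de")                              # same parser the script loads
corpus = textacy.Corpus(parser)

texts = ["Drucker druckt nicht", "VPN Zugang defekt"]  # stand-in for ticketcsv_to_textStream(...)
metas = [{"TicketNumber": "1"}, {"TicketNumber": "2"}] # stand-in for ticket_csv_to_DictStream(...)

# both streams must stay aligned: doc i gets metadata i
corpus.add_texts(texts, metadatas=metas)
corpus.remove(lambda doc: len(doc) == 0)               # drop empty tickets, as main() does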
Binary file not shown.

@@ -1,9 +0,0 @@
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"subject"}
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"frau hinrichs überdenkt die situation und macht dann neue anträge . dieses ticket wird geschlossen"}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"faxnummer 3166 wurde unter die telefonnummer 7179 im elektronischen telefonbuch eingetragen"}
{"categoryName":"lan","Subject":"defekte netzwerkdose frage zu vpn","Solution":"hallo herr rauner , die netzwerkdose weist z. z. keine verbindungsprobleme auf . falls doch welche bestehen , melden sie sich bitte bei uns . mit freunldichen grüßen aicha oikrim"}
{"categoryName":"betrieb","Subject":"sso login via browser mit zertifikat","Solution":"der login via zertifikat am sso - dienst mittels firefox und unicard sollte funktionieren . eventuell wurden durch ein browserupdate die einstellungen gelöscht . bitte prüfen sie ob die ca - zertifikate installiert sind : https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" und ob das kryptographie modul im firefox hinterlegt ist : https://service.tu-dortmund.de/group/intra/authentifizierung"}
{"categoryName":"elektronisches telefonbuch","Subject":"telephone contract","Solution":"erledigt"}
{"categoryName":"verwaltung","Subject":"laptop macht komische geräusche","Solution":"herr alexev swetlomier ( hiwi ) küümert sich bereits um das laptop und frau herbst weiß auch bescheid die zur zeit im urlaub ist"}

File diff suppressed because one or more lines are too long
init.py (339 lines changed)

@@ -4,6 +4,9 @@ from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
@@ -15,58 +18,35 @@ from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *


# todo configuration file ?
"""
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"


# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)


def create_lemma_dict(path2lemmalist):
    """
    Creates a dict out of a file a la:

DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")

        l1 w1
        l1 w2
        l2 w1
        l2 w2

    Result will be used as lemma_dict["word"] --> lemma


def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))


def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)


def create_lemma_dict(lemmalist):
    :param path2lemmalist: str
    :return: dictionary
    """
    lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(
        textacy.fileio.read_file_lines(path2lemmalist))))

    lemma_dict = {}
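The docstring above describes a file of "lemma word" pairs that gets turned into a word-to-lemma lookup. A minimal, self-contained sketch of just that mapping step (file format exactly as documented above; the helper name and toy lines are only illustrative):

def lemma_dict_from_lines(lines):
    # each line looks like "l1 w1": first token is the lemma, second the surface word
    lemma_dict = {}
    for line in lines:
        parts = line.split()
        if len(parts) != 2:
            continue
        lemma, word = parts[0].strip().lower(), parts[1].strip().lower()
        lemma_dict[word] = lemma
    return lemma_dict

print(lemma_dict_from_lines(["sein ist", "sein war", "haus haeuser"])["war"])   # -> "sein"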
@@ -81,69 +61,22 @@ def create_lemma_dict(lemmalist):

    return lemma_dict


"""
def build_thesaurus(path2lexicalentries, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))


def build_thesaurus_dict(path2wordnet,returnall=False):
    """
    Creates a dict out of the deWordNet
    https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml

    Result will be used as lemma_dict["word"] --> lemma

    :param path2lexicalentries: str
    :param returnall: bool if True, also return , word2synsets, synset2Words
    :return: dictionaries: thesaurus
    """
    lextree = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    synroot = syntree.getroot()

    thesaurus = []

    for r in synroot:
        for element in r:

            if element.tag == "Synset":
                sysnet = []
                attrib = element.attrib
                id = attrib["id"]

                for ro in lexroot:
                    for elem in ro:
                        if elem.tag == "LexicalEntry":
                            subs_dicts = [subentry.attrib for subentry in elem]
                            # <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]

                            dic = {k: v for x in subs_dicts for k, v in x.items()}  # to one dict
                            if "synset" in dic.keys():
                                if dic["synset"] == id:
                                    string = (dic["writtenForm"])

                                    # replaceRockDots
                                    string = re.sub(r'[ß]', "ss", string)
                                    string = re.sub(r'[ö]', "oe", string)
                                    string = re.sub(r'[ü]', "ue", string)
                                    string = re.sub(r'[ä]', "ae", string)

                                    # alle punkte raus
                                    string = re.sub(r'[.]', "", string)

                                    # alles in klammern raus
                                    string = re.sub(r"\((.*)\)", " ", string)

                                    # längeres leerzeichen normalisieren
                                    string = textacy.preprocess.normalize_whitespace(string)

                                    sysnet.append(string.lower().strip())

                # nach anzhal der wörter in den strings sortieren
                sysnet.sort(key=lambda x: len(x.split()))
                if len(sysnet) != 0:
                    # todo warum sind manche leer?
                    thesaurus.append(sysnet)
    return thesaurus

#todo thesaurus in dictionary
"""

def build_thesaurus(path2lexicalentries):#, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    #synroot = syntree.getroot()


    word2synsets = {}
    template = {"w1": ["s1", "s2"]}
@@ -167,6 +100,9 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        if string == "Kennwort":
                            pass

                        # replaceRockDots
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
@@ -186,10 +122,12 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):

            word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1","w2"]}

    for word,synset in word2synsets.items():
        if word != '':
            for syn in synset:
                if syn not in synset2Words.keys():
                    synset2Words[syn] = [word]
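The loop above inverts the word-to-synsets mapping into a synset-to-words mapping. The same step in isolation, with toy data (defaultdict is used here only for brevity; the committed code builds the dict by hand):

from collections import defaultdict

word2synsets = {"kennwort": ["s1"], "passwort": ["s1"], "rechner": ["s2"]}

synset2words = defaultdict(list)
for word, synsets in word2synsets.items():
    if word:                        # skip empty strings, as the original does
        for syn in synsets:
            synset2words[syn].append(word)

# synset2words -> {"s1": ["kennwort", "passwort"], "s2": ["rechner"]}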
@@ -203,91 +141,135 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
    thesaurus = {}
    thesaurus_template = {"w1" : "mainsyn"}

    for word,synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym
            thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym #todo nach (hauptform) suchen?
        except:
            pass

    if returnall:
        return thesaurus, word2synsets, synset2Words
    else:
        return thesaurus


def create_stopword_lists(*paths):
    """
    for r in synroot:
        for element in r:
    creates a list of stoppwords from:
        spacy
        nltk
        stop_words

            if element.tag == "Synset":
                synset = []
                attrib = element.attrib
                id = attrib["id"]
    :param paths: list of additional filepaths where each file looks like
        w1
        w2
        w3
    filenames must be a la de_stopwords_1.txt, en_stopwords_2.txt

                if id not in synset2Words.keys():
                    synset2Words[id] = "WORD"
    :return: lists: de_stopwords, en_stopwords
    """

    ## GERMAN

    # from packages
    de_stop_words1 = list(get_stop_words("de"))

    de_stop_words2 = list(nltk_stopwords.words('german'))

    de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)

    #from files
    de_filepaths = []
    for path in paths:
        if os.path.basename(path).split("_")[0] == 'de' and os.path.basename(path).split("_")[
            1] == 'stopwords':
            de_filepaths.append(path)


def create_stopwordlist():
    de_stop_words4 = list_from_files(*de_filepaths)

    de_stop_words1 = list(map(replaceRockDots(),
                              list(
                                  map(textacy.preprocess.normalize_whitespace,
                                      textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")
                                      )
                              )
                          )
                      )

    de_stop_words2 = list(map(replaceRockDots(),list(set(nltk_stopwords.words('german')))))

    de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))

    de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))

    de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))

    return de_stop_words

    #todo en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(nltk_stopwords.words('english'))))
    #combine everything
    de_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
                                                             de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))


    ## ENGLISH

    # from packages
    en_stop_words1 = list(get_stop_words("en"))

    en_stop_words2 = list(nltk_stopwords.words('english'))

    en_stop_words3 = list(__import__("spacy.en", globals(), locals(), ['object']).STOP_WORDS)

    # from files
    en_filepaths = [path for path in paths if
                    os.path.basename(path).split("_")[0] == 'en' and os.path.basename(path).split("_")[
                        1] == 'stopwords']

    en_stop_words4 = list_from_files(*en_filepaths)


########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download
    # combine everything
    en_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
                                                             en_stop_words1 + en_stop_words2 + en_stop_words3 + en_stop_words4)))))

    return de_stop_words, en_stop_words


def build_words_for_spellchecking(path2words):
    """
    create word-Counter for spellchecking

    http://norvig.com/spell-correct.html
    http://wortschatz.uni-leipzig.de/en/download

    http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz
    :return: Counter
    """
    def words(text): return re.findall(r'\w+', text.lower())

    return Counter(words(open(path2words).read()))


def words(text): return re.findall(r'\w+', text.lower())


##################################################################################################

# ziel: dictionaries für thesaurus, correctwordliste und lemmas als ladbare dateien
# außerdem saubere stoppwortliste und nomenliste


# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"

path2wordnet = config.get("thesaurus","input")
path2thesaurus_dict = config.get("thesaurus","pickle_file")


# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2words_file = config.get("spellchecking","input")
path2wordlist = config.get("spellchecking","pickle_file")


# LEMMA
path2lemma_file = config.get("lemmatization","input")
path2lemmadict = config.get("lemmatization","pickle_file")

# NOMEN
nouns1 = config.get("nouns","input1")
nouns2 = config.get("nouns","input2")
path2nouns_list = config.get("nouns","pickle_file")


path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"


# VORNAMEN
firstnames_txt = config.get("firstnames","input")
path2firstnameslist = config.get("firstnames","pickle_file")

# STOPWORDS
stop1 = config.get("de_stopwords","input1")
stop2 = config.get("de_stopwords","input2")
stop3 = config.get("de_stopwords","input3")
path2stopwordlist = config.get("de_stopwords","pickle_file")
@@ -297,71 +279,42 @@ def main():


    printlog("create and save lemma_dict")
    LEMMAS = list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))

    lemma_dict = create_lemma_dict(LEMMAS)
    lemma_dict = create_lemma_dict(path2lemma_file)
    save_obj(lemma_dict, path2lemmadict)


    printlog("Build and save Wordlist for Spellchecking")
    WORDS = Counter(words(open(path2words).read()))
    save_obj(WORDS, path2wordlist)

    words = build_words_for_spellchecking(path2words_file)
    save_obj(words, path2wordlist)


    printlog("Build and save Thesaurus")
    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)

    save_obj(THESAURUS, path2thesaurusdict)

    thesaurus = build_thesaurus_dict(path2wordnet)
    save_obj(thesaurus, path2thesaurus_dict)


    printlog("Build and save stoppwortliste")
    de_stop_words = create_stopwordlist()
    de_stop_words = create_stopword_lists(stop1, stop2, stop3)
    save_obj(de_stop_words, path2stopwordlist)


    printlog("Build and save nomenliste")
    NOUNS = list(textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
    NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
    save_obj(NOUNS, path2NOUNSlist)


    printlog("Build and save fistnameslist")
    VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))

    save_obj(VORNAMEN, path2firstnameslist)

    nouns = list_from_files(nouns1,nouns2)
    save_obj(nouns, path2nouns_list)


    printlog("Build and save firstnameslist")
    vornamen = list_from_files(firstnames_txt)
    save_obj(vornamen, path2firstnameslist)


    end = time.time()
    printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
    printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60))
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-


import init
import corporization
import preprocessing
from miscellaneous import *


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"


init.main()
printlog("")

corporization.main()
printlog("")

preprocessing.main()
printlog("")
@@ -0,0 +1,281 @@
# -*- coding: utf-8 -*-
import random

import time

from pathlib import Path

from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys

from datetime import datetime


import time
start = time.time()

import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)


import time

import enchant

start = time.time()

import logging

import csv
import functools
import os.path
import re
import subprocess
import time
import xml.etree.ElementTree as ET
import sys
import spacy
import textacy
from scipy import *
from textacy import Vectorizer
import warnings
import configparser as ConfigParser
import sys
import hunspell
from postal.parser import parse_address

from datetime import datetime

import time
import logging
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle


# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


# config logging
filename = config.get("logging","filename")
level = config.get("logging","level")
if level == "INFO":
    level = logging.INFO
elif level == "DEBUG":
    level = logging.DEBUG
elif level == "WARNING":
    level = logging.WARNING
logging.basicConfig(filename=filename, level=level)


def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))

    return functools.reduce(compose2, functions, lambda x: x)


def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")


def save_obj(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe",
                                         (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))


def list_from_files(*paths):
    """
    create string-list from file like
        n1
        n2
        n3

    :param paths: list(str) or str if single path
    :return: list(str)
    """

    listlist = []
    for path in paths:
        listlist.append(list(textacy.fileio.read_file_lines(path)))

    #liste von listen zu einer liste
    liste = [item for sublist in listlist for item in sublist]

    return list(map(textacy.preprocess.normalize_whitespace, liste))


def printRandomDoc(textacyCorpus):
    """
    prints and logs a random doc out of a textacy-Corpus
    :param textacyCorpus:
    """
    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))

    print()


def save_corpus(corpus, corpus_path, corpus_name):
    """
    saves a textacy-corpus including spacy-parser
    :param corpus: textacy-Corpus
    :param corpus_path: str
    :param corpus_name: str (should contain the language like "_de_")
    """

    """
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    #todo save vocab?
    """

    # save parser
    parser = corpus.spacy_lang
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


def load_corpus(corpus_path, corpus_name, lang="de"):
    """
    Load textacy-Corpus including spacy-parser out from file
    :param corpus_path: str
    :param corpus_name: str (should contain the language like "_de_")
    :param lang: str (language code)
    :return: textacy.Corpus, spacy.language
    """

    # check for language
    if "_de_" in corpus_name:
        lang="de"
    elif "_en_" in corpus_name:
        lang ="en"

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser'+'/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    #load corpus
    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus, corpus.spacy_lang
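A short usage sketch of the two functions above, as the other scripts in this commit call them (the path and corpus name are taken from config.ini; the return order follows the docstring above):

from miscellaneous import *

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"

# after corporization.py has built and saved the raw German corpus:
#   save_corpus(corpus=raw_de_corpus, corpus_path=corpus_path, corpus_name="de_raw_ticket")

# later, e.g. in preprocessing.py, the corpus and its spacy parser come back together:
corpus, parser = load_corpus(corpus_path=corpus_path, corpus_name="de_raw_ticket")
printRandomDoc(corpus)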
@@ -420,10 +420,10 @@ custom_words = ["grüßen", "fragen"]
####################'####################'####################'####################'####################'##############


## files to textacy-corpus
## files to textacy-corpi
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")
print("add texts to textacy-corpi...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))
@@ -182,8 +182,8 @@ cleanStream = compose(
    cleanEnt
)
"""
# content: xml -> stringCleaning -> pipe -> docCleaning -> corpus
# metadata: xml ->               -> stringCleaning       -> corpus
# content: xml -> stringCleaning -> pipe -> docCleaning -> corpi
# metadata: xml ->               -> stringCleaning       -> corpi

corpus = textacy.Corpus(PARSER)
preprocessing.py (755 lines changed)

@@ -2,27 +2,53 @@

from datetime import datetime
print(datetime.now())
from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *


path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"


#idee roh-corpus (nur whitespace weg) speichern -> pregeprocesster corpus -> damit arbeiten


path_csv_split = path2de_csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])
path_csv_split = path2en_csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])


import time
start = time.time()


from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)


import time

import logging
from nltk.corpus import stopwords
@@ -40,231 +66,29 @@ csv.field_size_limit(sys.maxsize)

import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)


def load_corpus(corpus_path, corpus_name, lang="de"):

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    #load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

print(load_corpus(corpus_path,corpus_name))


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"


# todo configuration file ?
"""
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""


# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)


# THESAURUS
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
THESAURUS = load_obj(path2thesaurusdict)


# SPELLCHECKING
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"


path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"


# SPELLCHECKING


parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')

DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")


de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
    "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))

en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(stopwords.words('english'))))


LEMMAS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))

VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
    "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))

NOUNS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))


"""
print(de_stop_words[10:30])
print(LEMMAS[10:30])
print(VORNAMEN[10:30])
print(NOUNS[10:30])
"""


mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)


def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)

printlog("Load functions")


def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))

    return functools.reduce(compose2, functions, lambda x: x)


def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")


def printRandomDoc(textacyCorpus):
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))

    print()


def csv_to_contentStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # standardvalue

    for i, lst in enumerate(stream):
        if i == 0:
            # look for desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def csv_to_metaStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    content_collumn = 0  # standardvalue
    metaindices = []
    metadata_temp = {}
    for i, lst in enumerate(stream):
        if i == 0:
            for j, col in enumerate(lst):  # geht bestimmt effizienter... egal, weil passiert nur einmal
                for key in metalist:
                    if key == col:
                        metaindices.append(j)
            metadata_temp = dict(
                zip(metalist, metaindices))  # zB {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}

        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata


REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'


THESAURUS = {}
WORDS = {}
LEMMAS = {}
NOUNS = []
VORNAMEN= []
de_stop_words=[]

############# filter tokens
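compose() above chains the cleaning steps right to left, i.e. the last function passed is applied first. A two-line illustration with throwaway lambdas, using compose as defined in the hunk above:

add_exclaim = lambda s: s + "!"
upper = lambda s: s.upper()

shout = compose(add_exclaim, upper)   # compose(f, g)(x) == f(g(x))
print(shout("ticket"))                # -> "TICKET!"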
@@ -303,14 +127,12 @@ def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search('\d', tok.lower_))


"""
def remove_words_containing_topLVL():
    return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))
    return lambda tok: not bool(re.search(REGEX_TOPLVL, tok.lower_))


def remove_words_containing_specialCharacters():
    return lambda tok: not bool(re.search(regex_specialChars, tok.lower_))
"""
    return lambda tok: not bool(re.search(REGEX_SPECIALCHAR, tok.lower_))


def remove_long_words():
@ -327,237 +149,28 @@ def remove_first_names():
|
|||
|
||||
############# strings
|
||||
|
||||
def replaceRockDots():
|
||||
return lambda string: re.sub(r'[ß]', "ss",
|
||||
(re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
|
||||
|
||||
def remove_addresses(string):
|
||||
pass # todo
|
||||
|
||||
|
||||
"""
|
||||
def stringcleaning(stringstream, funclist):
|
||||
for string in stringstream:
|
||||
for f in funclist:
|
||||
|
||||
string = f(string)
|
||||
yield string
|
||||
|
||||
def cut_after(word="gruss"):
|
||||
return lambda string: string.rpartition(word)[0] if word in string else string
|
||||
|
||||
def seperate_words_on_regex(regex=regex_specialChars):
|
||||
return lambda string: " ".join(re.compile(regex).split(string))
|
||||
|
||||
|
||||
|
||||
def remove_words_containing_topLVL():
|
||||
return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w) ])
|
||||
|
||||
|
||||
def replaceSpecialChars(replace_with=" "):
|
||||
return lambda string: re.sub(regex_specialChars, replace_with, string.lower())
|
||||
|
||||
|
||||
def replaceNumbers(replace_with="NUMBER"):
|
||||
return lambda string : textacy.preprocess.replace_numbers(string.lower(), replace_with=replace_with)
|
||||
|
||||
|
||||
def replacePhonenumbers(replace_with="PHONENUMBER"):
|
||||
return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)
|
||||
|
||||
|
||||
def replaceSharpS(replace_with="ss"):
|
||||
return lambda string: re.sub(r'[ß]',replace_with,string.lower())
|
||||
|
||||
def fixUnicode():
|
||||
return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
|
||||
"""
|
||||
|
||||
"""
|
||||
def lemmatizeWord(word,filepath=LEMMAS):
|
||||
for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
|
||||
if word.lower() == line.split()[1].strip().lower():
|
||||
return line.split()[0].strip().lower()
|
||||
return word.lower() # falls nix gefunden wurde
|
||||
|
||||
|
||||
def create_lemma_dicts(lemmalist=LEMMAS):
|
||||
w_dict = {}
|
||||
lem_dict = {}
|
||||
|
||||
for i, line in enumerate(lemmalist):
|
||||
try:
|
||||
lem_word_pair = line.split()
|
||||
|
||||
if len(lem_word_pair) != 2:
|
||||
print(line)
|
||||
|
||||
lemma = lem_word_pair[0].strip().lower()
|
||||
|
||||
word = lem_word_pair[1].strip().lower()
|
||||
except:
|
||||
print(line)
|
||||
|
||||
if lemma not in lem_dict:
|
||||
lem_dict[lemma] = i
|
||||
|
||||
if word not in w_dict:
|
||||
w_dict[word] = lem_dict[lemma]
|
||||
|
||||
l_dict = {v: k for k, v in lem_dict.items()} # switch key/values
|
||||
|
||||
return l_dict,w_dict
|
||||
|
||||
lemma_dict,word_dict = create_lemma_dicts()
|
||||
|
||||
|
||||
|
||||
def lemmatizeWord(word,l_dict=lemma_dict,w_dict=word_dict):
|
||||
#mehrmals machen
|
||||
for i in range(3):
|
||||
try:
|
||||
word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower()
|
||||
except:
|
||||
print(word)
|
||||
return word
|
||||
|
||||
def lemmatize():
|
||||
return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
|
||||
|
||||
def lemmatize():
|
||||
return lambda string: " ".join([lemmatizeWord(s.lower()) for s in string.split()])
|
||||
|
||||
DE_SPELLCHECKER = enchant.Dict("de_DE")
|
||||
EN_SPELLCHECKER = enchant.Dict("en_US")
|
||||
|
||||
def autocorrectWord(word,spellchecker=DE_SPELLCHECKER):
|
||||
|
||||
try:
|
||||
return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
|
||||
except:
|
||||
return word
|
||||
|
||||
|
||||
|
||||
def autocorrect():
|
||||
return lambda string: " ".join([autocorrectWord(s.lower()) for s in string.split()])
|
||||
"""
|
||||
|
||||
|
||||
def create_lemma_dicts(lemmalist=LEMMAS):
|
||||
w_dict = {}
|
||||
lem_dict = {}
|
||||
|
||||
for i, line in enumerate(lemmalist):
|
||||
try:
|
||||
lem_word_pair = line.split()
|
||||
|
||||
if len(lem_word_pair) != 2:
|
||||
print(line)
|
||||
|
||||
lemma = lem_word_pair[0].strip().lower()
|
||||
|
||||
word = lem_word_pair[1].strip().lower()
|
||||
except:
|
||||
print(line)
|
||||
|
||||
if lemma not in lem_dict:
|
||||
lem_dict[lemma] = i
|
||||
|
||||
if word not in w_dict:
|
||||
w_dict[word] = lem_dict[lemma]
|
||||
|
||||
l_dict = {v: k for k, v in lem_dict.items()} # switch key/values
|
||||
|
||||
return l_dict, w_dict
|
||||
|
||||
|
||||
lemma_dict, word_dict = create_lemma_dicts()
|
||||
|
||||
def lemmatizeWord(word, l_dict=lemma_dict, w_dict=word_dict, n=3):
|
||||
# mehrmals machen
|
||||
def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
|
||||
for i in range(n):
|
||||
try:
|
||||
word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower()
|
||||
word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
|
||||
except:
|
||||
print(word)
|
||||
return word
|
||||
|
||||
|
||||
def build_thesaurus(path2lexicalentries, path2synsets):
|
||||
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
|
||||
syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
|
||||
|
||||
lexroot = lextree.getroot()
|
||||
synroot = syntree.getroot()
|
||||
|
||||
thesaurus = []
|
||||
|
||||
for r in synroot:
|
||||
for element in r:
|
||||
|
||||
if element.tag == "Synset":
|
||||
sysnet = []
|
||||
attrib = element.attrib
|
||||
id = attrib["id"]
|
||||
|
||||
for ro in lexroot:
|
||||
for elem in ro:
|
||||
if elem.tag == "LexicalEntry":
|
||||
subs_dicts = [subentry.attrib for subentry in elem]
|
||||
# <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
|
||||
|
||||
dic = {k: v for x in subs_dicts for k, v in x.items()} # to one dict
|
||||
if "synset" in dic.keys():
|
||||
if dic["synset"] == id:
|
||||
string = (dic["writtenForm"])
|
||||
|
||||
# replaceRockDots
|
||||
string = re.sub(r'[ß]', "ss", string)
|
||||
string = re.sub(r'[ö]', "oe", string)
|
||||
string = re.sub(r'[ü]', "ue", string)
|
||||
string = re.sub(r'[ä]', "ae", string)
|
||||
|
||||
# alle punkte raus
|
||||
string = re.sub(r'[.]', "", string)
|
||||
|
||||
# alles in klammern raus
|
||||
string = re.sub(r"\((.*)\)", " ", string)
|
||||
|
||||
# längeres leerzeichen normalisieren
|
||||
string = textacy.preprocess.normalize_whitespace(string)
|
||||
|
||||
sysnet.append(string.lower().strip())
|
||||
|
||||
# nach anzhal der wörter in den strings sortieren
|
||||
sysnet.sort(key=lambda x: len(x.split()))
|
||||
if len(sysnet) != 0:
|
||||
# todo warum sind manche leer?
|
||||
thesaurus.append(sysnet)
|
||||
return thesaurus
|
||||
|
||||
printlog("Build Thesaurus")
|
||||
THESAURUS = []
|
||||
THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)
|
||||
|
||||
|
||||
def getFirstSynonym(word, thesaurus=THESAURUS):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms
    if word in thesaurus.keys():
        return thesaurus[word]
    else:
        return str(word)

        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word; todo: also normalize phrases
                if word == syn:
                    return syn_block[0]

    return str(word)  # as a fallback, return the original word

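The diff above interleaves two variants of the lookup: the older one walks the list of synonym groups returned by build_thesaurus(), the newer one expects a plain word-to-synonym mapping (presumably the pickled thesaurus_dict). A hypothetical illustration of both shapes, with made-up entries:

# List-of-groups structure, as returned by build_thesaurus(); shortest synonym first.
example_groups = [["rechner", "computer"], ["drucker", "ausgabegeraet"]]
# -> the loop-based variant would map "computer" to "rechner".

# Dict structure assumed by the newer variant (word -> first synonym).
example_dict = {"computer": "rechner", "ausgabegeraet": "drucker"}
# -> getFirstSynonym("computer", thesaurus=example_dict) would return "rechner".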
########################## Spellchecking ##########################################

@ -570,10 +183,6 @@ from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())


printlog("Build Wordlist for Spellchecking")
WORDS = {}
WORDS = Counter(words(open(path2words).read()))


def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N

@ -610,18 +219,6 @@ def edits2(word):
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

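The hunk elided here presumably contains the rest of Peter Norvig's well-known spelling corrector (edits1, known, candidates, correction), which autocorrectWord() below relies on. For reference, a standard sketch of the missing pieces looks roughly like this (an assumption, not copied from this repository):

def known(words_):
    # subset of candidate words that actually occur in the WORDS counter
    return set(w for w in words_ if w in WORDS)

def candidates(word):
    # prefer the word itself, then edit distance 1, then 2, else give up
    return known([word]) or known(edits1(word)) or known(edits2(word)) or [word]

def correction(word):
    # most probable spelling correction for word
    return max(candidates(word), key=P)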
"""
|
||||
DE_SPELLCHECKER = enchant.Dict("de_DE")
|
||||
EN_SPELLCHECKER = enchant.Dict("en_US")
|
||||
|
||||
def autocorrectWord(word, spellchecker=DE_SPELLCHECKER):
|
||||
try:
|
||||
return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
|
||||
except:
|
||||
return word
|
||||
"""
|
||||
|
||||
|
||||
def autocorrectWord(word):
|
||||
try:
|
||||
return correction(word)
|
||||
|
@ -629,15 +226,10 @@ def autocorrectWord(word):
|
|||
return word
|
||||
|
||||
|
||||
##################################################################################################


############# stringcleaning

def stringcleaning(stringstream):
    regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
    regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

    for string in stringstream:
        string = string.lower()

@ -646,7 +238,7 @@ def stringcleaning(stringstream):
        string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')

        # remove_words_containing_topLVL
        string = " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)])
        string = " ".join([w.lower() for w in string.split() if not re.search(REGEX_TOPLVL, w)])

        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)

@ -655,7 +247,7 @@ def stringcleaning(stringstream):
        string = re.sub(r'[ä]', "ae", string)

        # separate_words_on_regex:
        string = " ".join(re.compile(regex_specialChars).split(string))
        string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))

        # cut_after
        word = "gruss"

@ -672,8 +264,27 @@ def stringcleaning(stringstream):

        yield string

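A small, hypothetical round trip through the generator above; the exact output also depends on the cut_after("gruss") step elided by the hunk:

# Hypothetical ticket text; umlauts get rewritten, special characters are split off,
# and everything after the closing "gruss"/"gruesse" is meant to be cut away.
raw = ['Der Drucker im Büro druckt nicht mehr!!! Viele Grüße, Max Mustermann']
for cleaned in stringcleaning(raw):
    print(cleaned)  # e.g. "der drucker im buero druckt nicht mehr viele gruesse ..."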
def filterTokens(tokens, funclist):
    # in: tokenlist, funclist
    # out: tokenlist
    for f in funclist:
        tokens = list(filter(f, tokens))

def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
    return tokens




def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text

def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

def processContentstream(textstream, parser, token_filterlist=None):
    """
    :param textstream: string-gen
    :param funclist: [func]

@ -681,28 +292,6 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
    :return: string-gen
    """

    """
    filter_tokens=[
        #removeENT(["PERSON"]),
        # idea: strip addresses; so far handled via cut_after("gruss") --> postal.parser
        # idea: spelling correction --> PyEnchant
        # idea: thesaurus --> WordNet, or a custom one

        remove_words_containing_Numbers(),

        removePOS(["PUNCT","SPACE","NUM"]),

        removeWords(de_stop_words+custom_words),

        remove_long_words(),
        remove_short_words(),
        remove_first_names(),

        keepPOS(["NOUN"]),

    ]
    """

    # pre_parse
    textstream = stringcleaning(textstream)

@ -720,8 +309,7 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
        yield " ".join([tok.lower_ for tok in tokens])
        # yield " ".join(list(set([tok.lower_ for tok in tokens])))



def processDictstream(dictstream, funcdict, parser=DE_PARSER):
def processDictstream(dictstream, funcdict, parser):
    """

    :param dictstream: dict-gen

@ -754,58 +342,34 @@ def processDictstream(dictstream, funcdict, parser=DE_PARSER):
        yield result

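Most of the processDictstream() body is elided by the hunk above; judging from clean_in_meta below and the token filters used elsewhere, it presumably parses each metadata value and applies the per-key filter list, roughly along these lines (an assumption, sketched as comments only):

# for dic in dictstream:
#     result = {}
#     for key, value in dic.items():
#         if key in funcdict:
#             tokens = [tok for tok in parser(value)]
#             tokens = filterTokens(tokens, funcdict[key])
#             result[key] = " ".join(tok.lower_ for tok in tokens)
#         else:
#             result[key] = value
#     yield result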
def filterTokens(tokens, funclist):
    # in: tokenlist, funclist
    # out: tokenlist
    for f in funclist:
        tokens = list(filter(f, tokens))
##################################################################################################

    return tokens

def cleanString(string):
    # replaceRockDots
    string = re.sub(r'[ß]', "ss", string)
    string = re.sub(r'[ö]', "oe", string)
    string = re.sub(r'[ü]', "ue", string)
    string = re.sub(r'[ä]', "ae", string)
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"


    # normalize longer whitespace runs
    string = textacy.preprocess.normalize_whitespace(string)
path2thesaurus_dict = config.get("thesaurus","pickle_file")

    return(string)
path2wordsdict = config.get("spellchecking", "pickle_file")

def normalizeTextStream(textstream,clean=False):
    """
    :param textstream: string-gen
    :param parser: spacy-parser
    :yield: string-gen
    """
path2lemmadict = config.get("lemmatization","pickle_file")

    for txt in textstream:
        if clean:
            yield cleanString(txt)
        else:
            yield textacy.preprocess.normalize_whitespace(txt)
path2nouns_list = config.get("nouns","pickle_file")

def nomalizeDictstream(dictstream, clean=False):
    """
    :param dictstream: dict-gen
    :param parser: spacy-parser
    :yield: dict-gen
    """
path2firstnameslist = config.get("firstnames","pickle_file")

    for dic in dictstream:
path2stopwordlist = config.get("de_stopwords","pickle_file")

        result = {}

        for key, value in dic.items():
            if clean:
                result[key] = cleanString(value)
            else:
                result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result

corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")
pre_de_name = config.get("de_corpus", "pre")


corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")
pre_en_name = config.get("en_corpus", "pre")


custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",

@ -819,6 +383,7 @@ custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanke
                "funktionieren", "kollege", "pruefen", "hoffen"
                ]


filter_tokens = [
    # removeENT(["PERSON"]),
    # idea: strip addresses; so far handled via cut_after("gruss") --> postal.parser

@ -829,7 +394,8 @@ filter_tokens = [

    removePOS(["PUNCT", "SPACE", "NUM"]),

    removeWords(de_stop_words + custom_words),
    #removeWords(de_stop_words + custom_words),
    removeWords(de_stop_words),

    remove_long_words(),
    remove_short_words(),

@ -838,11 +404,7 @@ filter_tokens = [

]

metaliste = [
    "Subject",
    "categoryName",
    "Solution"
]


clean_in_meta = {
    "Solution": [removePOS(["SPACE"])],

@ -850,6 +412,78 @@ clean_in_meta = {
    "categoryName": [removePOS(["SPACE", "PUNCT"])]
}

def main():
    start = time.time()
    printlog("Preprocessing: {0}".format(datetime.now()))


    THESAURUS = load_obj(path2thesaurus_dict)
    WORDS = load_obj(path2wordsdict)
    LEMMAS = load_obj(path2lemmadict)
    DE_STOP_WORDS = load_obj(path2stopwordlist)
    NOUNS = load_obj(path2nouns_list)
    VORNAMEN = load_obj(path2firstnameslist)


    # load raw corpus and create new one
    raw_de_corpus, DE_PARSER = load_corpus(corpus_name=raw_de_name, corpus_path=corpus_de_path)
    raw_en_corpus, EN_PARSER = load_corpus(corpus_name=raw_en_name, corpus_path=corpus_en_path)

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)


    ## process and add files to textacy-corpi
    printlog("Preprocess and add texts to textacy-corpi")
    de_corpus.add_texts(
        processContentstream(corpus2Text(raw_de_corpus), token_filterlist=filter_tokens, parser=DE_PARSER),
        processDictstream(corpus2Meta(raw_de_corpus), clean_in_meta, parser=raw_de_corpus.lang)
    )
    en_corpus.add_texts(
        processContentstream(corpus2Text(raw_en_corpus), token_filterlist=filter_tokens, parser=EN_PARSER),
        processDictstream(corpus2Meta(raw_en_corpus), clean_in_meta, parser=raw_en_corpus.lang)
    )


    # kick empty docs out of the corpi
    de_corpus.remove(lambda doc: len(doc) == 0)
    en_corpus.remove(lambda doc: len(doc) == 0)


    for i in range(20):
        printRandomDoc(de_corpus)
        #printRandomDoc(en_corpus)


    # save corpi
    save_corpus(corpus=de_corpus, corpus_path=corpus_de_path, corpus_name=pre_de_name)
    save_corpus(corpus=en_corpus, corpus_path=corpus_en_path, corpus_name=pre_en_name)


    end = time.time()
    printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()

"""
|
||||
pipe=[
|
||||
|
||||
|
@ -889,37 +523,24 @@ pipe=[
|
|||
|
||||
"""
|
||||
|
||||
de_corpus = textacy.Corpus(DE_PARSER)
|
||||
en_corpus = textacy.Corpus(EN_PARSER)
|
||||
"""
|
||||
filter_tokens=[
|
||||
#removeENT(["PERSON"]),
|
||||
#idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser
|
||||
#idee rechtschreibkorrektur --> PyEnchant
|
||||
#idee thesaurus --> WordNet, eigener
|
||||
|
||||
remove_words_containing_Numbers(),
|
||||
|
||||
removePOS(["PUNCT","SPACE","NUM"]),
|
||||
|
||||
## add files to textacy-corpus,
|
||||
printlog("Add texts to textacy-corpus")
|
||||
de_corpus.add_texts(
|
||||
processContentstream(csv_to_contentStream(path2de_csv, "Description"), token_filterlist=filter_tokens),
|
||||
processDictstream(csv_to_metaStream(path2de_csv, metaliste), clean_in_meta)
|
||||
)
|
||||
removeWords(de_stop_words+custom_words),
|
||||
|
||||
remove_long_words(),
|
||||
remove_short_words(),
|
||||
remove_first_names(),
|
||||
|
||||
keepPOS(["NOUN"]),
|
||||
|
||||
# leere docs aus corpus kicken
|
||||
de_corpus.remove(lambda doc: len(doc) == 0)
|
||||
|
||||
for i in range(20):
|
||||
printRandomDoc(de_corpus)
|
||||
|
||||
|
||||
#save corpus
|
||||
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
|
||||
corpus_name = "de_corpus"
|
||||
|
||||
save_corpus(corpus=de_corpus,corpus_path=corpus_path,corpus_name=corpus_name)
|
||||
|
||||
|
||||
#todo das selbe mit en_corpus
|
||||
|
||||
|
||||
|
||||
end = time.time()
|
||||
printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
|
||||
]
|
||||
"""
|
||||
|
12 test.py
@ -517,8 +517,8 @@ clean_in_content=[


## add files to textacy-corpus,
printlog("add texts to textacy-corpus")
## add files to textacy-corpi,
printlog("add texts to textacy-corpi")
ticketcorpus.add_texts(
    processTextstream(csv_to_contentStream(path2csv,"Description"), clean_in_content),
    processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)

@ -558,7 +558,7 @@ def label2ID(label,labeldict=LABELDICT):

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text


@ -596,7 +596,7 @@ n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen

printlog("vectorize corpus...")
printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)

@ -620,8 +620,8 @@ printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

#Transform the corpus and interpret our model:
printlog("Transform the corpus and interpret our model..")
#Transform the corpi and interpret our model:
printlog("Transform the corpi and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()

28 testo.py
@ -841,15 +841,15 @@ de_corpus = textacy.Corpus(DE_PARSER)


## add files to textacy-corpus,
printlog("add texts to textacy-corpus")
## add files to textacy-corpi,
printlog("add texts to textacy-corpi")
de_corpus.add_texts(
    processContentstream(csv_to_contentStream(path2csv,"Description"), token_filterlist=filter_tokens),
    processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
)


# kick empty docs out of the corpus
# kick empty docs out of the corpi
de_corpus.remove(lambda doc: len(doc)==0)


@ -873,7 +873,7 @@ def printvecotorization(ngrams = 1,min_df = 1,max_df = 1.0,weighting ='tf',named

#printlog("vectorize corpus...")
#printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)

@ -908,7 +908,7 @@ printvecotorization(ngrams=(1,2),min_df=1,max_df=0.8,weighting=weighting)

"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "de_corpus"
corpus_compression = 'gzip'
de_corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)

@ -951,7 +951,7 @@ def topicModeling(ngrams,min_df,max_df,topicModel = 'lda',n_topics = len(LABELDI
####################'####################

#printlog("vectorize corpus...")
#printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)

@ -971,8 +971,8 @@ def topicModeling(ngrams,min_df,max_df,topicModel = 'lda',n_topics = len(LABELDI
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

#Transform the corpus and interpret our model:
#printlog("Transform the corpus and interpret our model..")
#Transform the corpi and interpret our model:
#printlog("Transform the corpi and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()


@ -1016,35 +1016,35 @@ topicModeling(ngrams = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)
              corpi=de_corpus)

topicModeling(ngrams = 1,
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.2,
              max_df = 0.8,
              topicModel = 'lda',
              n_topics = 20,
              corpus=de_corpus)
              corpi=de_corpus)



@ -1124,7 +1124,7 @@ def label2ID(label,labeldict=LABELDICT):

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text

120 testra.py
@ -21,7 +21,7 @@ print(datetime.now())

PARSER=spacy.load("de")


corpus = textacy.Corpus(PARSER)
corpi = textacy.Corpus(PARSER)

testcontetn = [
    "fdsfdsfsd",

@ -46,12 +46,12 @@ def makemeta( testmetda):
    yield metdata


corpus.add_texts(
corpi.add_texts(
    makecontent(testcontetn),
    makemeta(testmetda)
)

print(corpus)
print(corpi)
"""


@ -66,12 +66,79 @@ def load_obj(path ):
    return pickle.load(f)



def load_corpus(corpus_path, corpus_name, lang="de"):
    from pathlib import Path

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser'+'/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    corpus = textacy.Corpus(parser)


    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"


    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

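A minimal usage sketch for the new load_corpus() above, assuming the corpus files written by save_corpus() further down already exist under the given path (path and name below are placeholders):

# Hypothetical call; corpus_path/corpus_name must match what save_corpus() produced.
corpus = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",
                     corpus_name="de_raw_ticket")
print(len(corpus))          # number of documents
print(corpus[0].metadata)   # metadata of the first ticket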
import os
a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"
d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt"

c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"

liste = [a,b,c,d]
de_filepaths = [path for path in liste if os.path.basename(path).split("_")[0]=='de' and os.path.basename(path).split("_")[1]=='stopwords']

from nltk.corpus import stopwords as nltk_stopwords
from stop_words import get_stop_words
import spacy
from miscellaneous import *

# from packages
de_stop_words1 = list(get_stop_words("de"))

de_stop_words2 = list(nltk_stopwords.words('german'))

de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)

# from files
de_stop_words_list = [list(textacy.fileio.read_file_lines(path)) for path in de_filepaths]
de_stop_words4 = [item for sublist in de_stop_words_list for item in sublist]
#print(de_stop_words4)

de_stop_words = list(set(map(replaceRockDots(),list(map(textacy.preprocess.normalize_whitespace, de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
print(len(de_stop_words))
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testra.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_testra.log &"

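replaceRockDots() is imported from miscellaneous and not shown in this diff; judging from the inline re.sub chains used elsewhere in this commit, it presumably returns a callable along these lines (an assumption, not the actual implementation):

import re

def replaceRockDots():
    # assumed behaviour: rewrite German umlauts and ß the same way the inline
    # substitutions above do (ß -> ss, ö -> oe, ü -> ue, ä -> ae)
    return lambda string: re.sub(r'[ä]', "ae",
                          re.sub(r'[ü]', "ue",
                          re.sub(r'[ö]', "oe",
                          re.sub(r'[ß]', "ss", string.lower()))))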
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"


def build_thesaurus(path2lexicalentries):#, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

@ -159,29 +226,6 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
"""


def load_corpus(corpus_path, corpus_name, lang="de"):
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    # load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

# todo: load corpus from file; idea: load stringstore and vocab

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

print(load_corpus(corpus_path, corpus_name))

"""
from postal.parser import parse_address

@ -197,12 +241,12 @@ print(parse_address(address))

"""

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "testcorpus"


#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
#corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)


@ -225,12 +269,12 @@ def save_corpus(corpus_path,corpus_name):

    #save content
    contentpath = corpus_path + corpus_name+ "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus),contentpath)
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi),contentpath)


    #save meta
    metapath = corpus_path + corpus_name +"_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
    textacy.fileio.write_json_lines((doc.metadata for doc in corpi), metapath)


@ -243,8 +287,8 @@ def load_corpus(corpus_path,corpus_name):
    with open(stringstore_path,"r") as file:
        nlp.vocab.strings.load(file)

    # define corpus
    corpus = textacy.Corpus(nlp)
    # define corpi
    corpi = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name +"_meta.json"

@ -252,13 +296,13 @@ def load_corpus(corpus_path,corpus_name):

    #load content
    contentpath = corpus_path + corpus_name+ "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
        corpi.add_doc(
            textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))

    return corpus
    return corpi


save_corpus(corpus_path,corpus_name)

114 topicModeling.py
@ -10,6 +10,46 @@ import time
import enchant

start = time.time()
from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *



import time



from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)

import logging

@ -34,56 +74,6 @@ from postal.parser import parse_address
csv.field_size_limit(sys.maxsize)


def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


printlog("Load functions")

def printRandomDoc(textacyCorpus):
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))

    print()


def load_corpus(corpus_path,corpus_name):
    # load new lang
    nlp = spacy.load("de")

    #load stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path,"r") as file:
        nlp.vocab.strings.load(file)

    # define corpus
    corpus = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name +"_meta.json"
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    #load content
    contentpath = corpus_path + corpus_name+ "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))

    return corpus

def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):

@ -92,7 +82,7 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en
    printlog(str("max_df: {0}".format(max_df)))
    printlog(str("named_entities: {0}".format(named_entities)))

    # printlog("vectorize corpus...")
    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)

@ -107,10 +97,10 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en


corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "de_corpus"

# load corpus
# load corpi
de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path)

@ -172,7 +162,7 @@ def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(
    ####################'####################

    # printlog("vectorize corpus...")
    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)

@ -191,8 +181,8 @@ def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

    # Transform the corpus and interpret our model:
    # printlog("Transform the corpus and interpret our model..")
    # Transform the corpi and interpret our model:
    # printlog("Transform the corpi and interpret our model..")
    doc_topic_matrix = model.transform(doc_term_matrix)
    print()

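The printing of the fitted topics is elided by the surrounding hunks; with textacy's TopicModel API that step typically looks like the following (a sketch, assuming the vectorizer and model defined above are in scope):

# Print the top terms per topic; id_to_term maps column indices back to strings.
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=10):
    print("topic {0}: {1}".format(topic_idx, " ".join(top_terms)))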
@ -228,35 +218,35 @@ topicModeling(ngrams = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)
              corpi=de_corpus)

topicModeling(ngrams = 1,
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)
              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.2,
              max_df = 0.8,
              topicModel = 'lda',
              n_topics = 20,
              corpus=de_corpus)
              corpi=de_corpus)



@ -292,7 +282,7 @@ def label2ID(label, labeldict=LABELDICT):

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
