aufgeräumt ("tidied up")

This commit is contained in:
parent 4fe12679fb
commit 56c8bce2d7

config.ini (70 lines changed)
@@ -1,21 +1,67 @@
-[filepath]
+[thesaurus]
+input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
-thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv
+pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl
-path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml
-path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv
-small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
-logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
-lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt
+
+[spellchecking]
+input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
+pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl
+
+[lemmatization]
+input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
+pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl
+
+[nouns]
+input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
+input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
+pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl
+
+[firstnames]
+input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
+pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl
+
+[de_stopwords]
+input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
+input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
+input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
+pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl
+
+[logging]
+level = INFO
+filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
+
+[de_corpus]
+#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
+#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
+#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
+input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv
+path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
+raw = de_raw_ticket
+pre = de_pre_ticket
+
+[en_corpus]
+input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv
+path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
+raw = en_raw_ticket
+pre = en_pre_ticket
+
+[tickets]
+content_collumn_name = Description
+metaliste = TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution
-language = de

 [preprocessing]
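
For orientation, a minimal sketch of how the reorganized sections are meant to be consumed. It mirrors the config.get(...) calls added in init.py and corporization.py below; the split(",") for metaliste is an assumption, since configparser returns the comma-separated value as a plain string.

import configparser

config = configparser.ConfigParser()
with open("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini") as f:
    config.read_file(f)

path2wordnet = config.get("thesaurus", "input")                  # deWordNet.xml
content_column = config.get("tickets", "content_collumn_name")   # "Description"
# configparser hands back one string; splitting into a list is an assumption,
# the commit itself passes the raw string on to ticket_csv_to_DictStream.
metaliste = config.get("tickets", "metaliste").split(",")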
corporization.py (197 lines changed)

@@ -1,6 +1,35 @@
 # -*- coding: utf-8 -*-
+from datetime import datetime
 import time
+import logging
+from stop_words import get_stop_words
+#import words as words
+from nltk.corpus import stopwords as nltk_stopwords
+from collections import Counter
+import csv
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+csv.field_size_limit(sys.maxsize)
+import pickle
+import configparser as ConfigParser
+from miscellaneous import *
+import time
 from datetime import datetime
@@ -17,87 +46,15 @@ import sys
 csv.field_size_limit(sys.maxsize)
+# load config
-# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
-path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
-path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
-#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
-path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
-path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"
-content_collumn_name = "Description"
-metaliste = [
-    "TicketNumber",
-    "Subject",
-    "CreatedDate",
-    "categoryName",
-    "Impact",
-    "Urgency",
-    "BenutzerID",
-    "VerantwortlicherID",
-    "EigentuemerID",
-    "Solution"
-]
-corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
-corpus_name = "de_raw_ticketCorpus"
-logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
-# todo configuration file
-"""
 config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:
     config.read_file(f)
-"""
-# config logging
-logging.basicConfig(filename=logfile, level=logging.INFO)
-# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
-def printlog(string, level="INFO"):
-    """log and prints"""
-    print(string)
-    if level == "INFO":
-        logging.info(string)
-    elif level == "DEBUG":
-        logging.debug(string)
-    elif level == "WARNING":
-        logging.warning(string)
-def printRandomDoc(textacyCorpus):
-    import random
-    print()
-    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
-    randIndex = int((len(textacyCorpus) - 1) * random.random())
-    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
-                                                               textacyCorpus[randIndex].metadata))
-    print()
 def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
@@ -146,75 +103,93 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
         yield metadata
-def save_corpus(corpus, corpus_path, corpus_name, parser):
-    """
-    # save stringstore
-    stringstore_path = corpus_path + corpus_name + '_strings.json'
-    with open(stringstore_path, "w") as file:
-        parser.vocab.strings.dump(file)
-    #todo save vocab?
-    """
-    # save parser
-    parserpath = corpus_path + str(parser.lang) + '_parser'
-    parser.save_to_directory(parserpath)
-    # save content
-    contentpath = corpus_path + corpus_name + "_content.bin"
-    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
-    # save meta
-    metapath = corpus_path + corpus_name + "_meta.json"
-    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
 ##################################################################################################
+# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
+"""
+content_collumn_name = "Description"
+metaliste = [
+    "TicketNumber",
+    "Subject",
+    "CreatedDate",
+    "categoryName",
+    "Impact",
+    "Urgency",
+    "BenutzerID",
+    "VerantwortlicherID",
+    "EigentuemerID",
+    "Solution"
+]
+"""
+content_collumn_name = config.get("tickets","content_collumn_name")
+metaliste = config.get("tickets","metaliste")
+path2de_csv = config.get("de_corpus","input")
+corpus_de_path = config.get("de_corpus", "path")
+raw_de_name = config.get("de_corpus", "raw")
+path2en_csv = config.get("en_corpus","input")
+corpus_en_path = config.get("en_corpus", "path")
+raw_en_name = config.get("en_corpus", "raw")
 def main():
+    start = time.time()
     printlog("Corporization: {0}".format(datetime.now()))
+    #print paths
     path_csv_split = path2de_csv.split("/")
     printlog(path_csv_split[len(path_csv_split) - 1])
     path_csv_split = path2en_csv.split("/")
     printlog(path_csv_split[len(path_csv_split) - 1])
-    start = time.time()
     DE_PARSER = spacy.load("de")
     EN_PARSER = spacy.load("en")
-    de_corpus = textacy.Corpus(DE_PARSER)
+    raw_de_corpus = textacy.Corpus(DE_PARSER)
-    en_corpus = textacy.Corpus(EN_PARSER)
+    raw_en_corpus = textacy.Corpus(EN_PARSER)
-    ## add files to textacy-corpus,
+    ## add files to textacy-corpi,
-    printlog("Add texts to textacy-corpus")
+    printlog("Add texts to textacy-corpi")
-    de_corpus.add_texts(
+    raw_de_corpus.add_texts(
         ticketcsv_to_textStream(path2de_csv, content_collumn_name),
         ticket_csv_to_DictStream(path2de_csv, metaliste)
     )
+    raw_en_corpus.add_texts(
-    # leere docs aus corpus kicken
+        ticketcsv_to_textStream(path2en_csv, content_collumn_name),
-    de_corpus.remove(lambda doc: len(doc) == 0)
+        ticket_csv_to_DictStream(path2en_csv, metaliste)
+    )
-    for i in range(20):
+    # leere docs aus corpi kicken
-        printRandomDoc(de_corpus)
+    raw_de_corpus.remove(lambda doc: len(doc) == 0)
+    raw_en_corpus.remove(lambda doc: len(doc) == 0)
-    #save corpus
+    #for i in range(20):
+    #    printRandomDoc(raw_de_corpus)
+    #    printRandomDoc(raw_en_corpus)
-    save_corpus(corpus=de_corpus,corpus_path=corpus_path,corpus_name=corpus_name,parser=DE_PARSER)
-    #todo das selbe mit en_corpus
+    #save corpi
+    save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
+    save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)
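
After this change the module draws every path and column name from config.ini and builds one raw corpus per language. A condensed sketch of the resulting flow; function names are taken from the diff, the config path is shortened here, and splitting metaliste into a list is an assumption (the commit passes the raw config string straight through).

import configparser, spacy, textacy
from corporization import ticketcsv_to_textStream, ticket_csv_to_DictStream

config = configparser.ConfigParser()
with open("config.ini") as f:          # full path as in the commit, shortened here
    config.read_file(f)

path2de_csv = config.get("de_corpus", "input")
column = config.get("tickets", "content_collumn_name")
metaliste = config.get("tickets", "metaliste").split(",")   # assumption: a real list is wanted

parser = spacy.load("de")
raw_de_corpus = textacy.Corpus(parser)
raw_de_corpus.add_texts(
    ticketcsv_to_textStream(path2de_csv, column),
    ticket_csv_to_DictStream(path2de_csv, metaliste),
)
raw_de_corpus.remove(lambda doc: len(doc) == 0)   # drop empty tickets, as in main()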
Binary file not shown.

@@ -1,9 +0,0 @@
-{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
-{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"subject"}
-{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
-{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"frau hinrichs überdenkt die situation und macht dann neue anträge . dieses ticket wird geschlossen"}
-{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"faxnummer 3166 wurde unter die telefonnummer 7179 im elektronischen telefonbuch eingetragen"}
-{"categoryName":"lan","Subject":"defekte netzwerkdose frage zu vpn","Solution":"hallo herr rauner , die netzwerkdose weist z. z. keine verbindungsprobleme auf . falls doch welche bestehen , melden sie sich bitte bei uns . mit freunldichen grüßen aicha oikrim"}
-{"categoryName":"betrieb","Subject":"sso login via browser mit zertifikat","Solution":"der login via zertifikat am sso - dienst mittels firefox und unicard sollte funktionieren . eventuell wurden durch ein browserupdate die einstellungen gelöscht . bitte prüfen sie ob die ca - zertifikate installiert sind : https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" und ob das kryptographie modul im firefox hinterlegt ist : https://service.tu-dortmund.de/group/intra/authentifizierung"}
-{"categoryName":"elektronisches telefonbuch","Subject":"telephone contract","Solution":"erledigt"}
-{"categoryName":"verwaltung","Subject":"laptop macht komische geräusche","Solution":"herr alexev swetlomier ( hiwi ) küümert sich bereits um das laptop und frau herbst weiß auch bescheid die zur zeit im urlaub ist"}

File diff suppressed because one or more lines are too long
init.py (339 lines changed)

@@ -4,6 +4,9 @@ from datetime import datetime
 import time
 import logging
+from stop_words import get_stop_words
+#import words as words
 from nltk.corpus import stopwords as nltk_stopwords
 from collections import Counter
 import csv
@@ -15,58 +18,35 @@ from scipy import *
 import sys
 csv.field_size_limit(sys.maxsize)
 import pickle
+import configparser as ConfigParser
+from miscellaneous import *
-# todo configuration file ?
+# load config
-"""
 config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:
     config.read_file(f)
+def create_lemma_dict(path2lemmalist):
     """
+    Creates a dict out of a file a la:
-# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"
+    l1 w1
+    l1 w2
+    l2 w1
+    l2 w2
+    Result will be used as lemma_dict["word"] --> lemma
-# config logging
+    :param path2lemmalist: str
-logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
+    :return: dictionary
-logging.basicConfig(filename=logfile, level=logging.INFO)
+    """
+    lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(
+        textacy.fileio.read_file_lines(path2lemmalist))))
-DE_PARSER = spacy.load("de")
-EN_PARSER = spacy.load("en")
-def replaceRockDots():
-    return lambda string: re.sub(r'[ß]', "ss",
-                                 (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
-def printlog(string, level="INFO"):
-    """log and prints"""
-    print(string)
-    if level == "INFO":
-        logging.info(string)
-    elif level == "DEBUG":
-        logging.debug(string)
-    elif level == "WARNING":
-        logging.warning(string)
-def save_obj(obj, path):
-    with open(path + '.pkl', 'wb') as f:
-        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
-def load_obj(path ):
-    with open(path + '.pkl', 'rb') as f:
-        return pickle.load(f)
-def create_lemma_dict(lemmalist):
     lemma_dict = {}
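
create_lemma_dict reads a whitespace-separated lemma/word list and inverts it into a word → lemma lookup. A tiny self-contained sketch of that mapping; the sample lines follow the "l1 w1" format quoted in the docstring, and the commit's own dict-building loop continues in the hunk below, so this is only an illustration.

# illustration only: build word -> lemma from "lemma word" lines
lines = [
    "sein ist",
    "sein war",
    "gehen ging",
]
lemma_dict = {}
for line in lines:
    lemma, word = line.split()[0], line.split()[1]
    lemma_dict[word.lower()] = lemma.lower()

assert lemma_dict["war"] == "sein"   # lemma_dict["word"] --> lemma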
@@ -81,69 +61,22 @@ def create_lemma_dict(lemmalist):
     return lemma_dict
+def build_thesaurus_dict(path2wordnet,returnall=False):
     """
-def build_thesaurus(path2lexicalentries, path2synsets):
+    Creates a dict out of the deWordNet
-    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
+    https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml
-    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
+    Result will be used as lemma_dict["word"] --> lemma
+    :param path2lexicalentries: str
+    :param returnall: bool if True, also return , word2synsets, synset2Words
+    :return: dictionaries: thesaurus
+    """
+    lextree = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8"))
     lexroot = lextree.getroot()
-    synroot = syntree.getroot()
-    thesaurus = []
-    for r in synroot:
-        for element in r:
-            if element.tag == "Synset":
-                sysnet = []
-                attrib = element.attrib
-                id = attrib["id"]
-                for ro in lexroot:
-                    for elem in ro:
-                        if elem.tag == "LexicalEntry":
-                            subs_dicts = [subentry.attrib for subentry in elem]
-                            # <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
-                            dic = {k: v for x in subs_dicts for k, v in x.items()} # to one dict
-                            if "synset" in dic.keys():
-                                if dic["synset"] == id:
-                                    string = (dic["writtenForm"])
-                                    # replaceRockDots
-                                    string = re.sub(r'[ß]', "ss", string)
-                                    string = re.sub(r'[ö]', "oe", string)
-                                    string = re.sub(r'[ü]', "ue", string)
-                                    string = re.sub(r'[ä]', "ae", string)
-                                    # alle punkte raus
-                                    string = re.sub(r'[.]', "", string)
-                                    # alles in klammern raus
-                                    string = re.sub(r"\((.*)\)", " ", string)
-                                    # längeres leerzeichen normalisieren
-                                    string = textacy.preprocess.normalize_whitespace(string)
-                                    sysnet.append(string.lower().strip())
-                # nach anzhal der wörter in den strings sortieren
-                sysnet.sort(key=lambda x: len(x.split()))
-                if len(sysnet) != 0:
-                    # todo warum sind manche leer?
-                    thesaurus.append(sysnet)
-    return thesaurus
-    #todo thesaurus in dictionary
-    """
-def build_thesaurus(path2lexicalentries):#, path2synsets):
-    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
-    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
-    lexroot = lextree.getroot()
-    #synroot = syntree.getroot()
     word2synsets = {}
     template = {"w1": ["s1", "s2"]}
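
build_thesaurus_dict walks the deWordNet XML and maps each writtenForm to its synset ids before inverting that into a word → main-synonym dict. A compact sketch of the first step, under the assumption (suggested by the attribute list quoted in the deleted code) that the XML carries LexicalEntry elements whose children hold writtenForm and synset attributes; this is not the commit's exact loop.

import xml.etree.ElementTree as ET

def word_to_synsets(path2wordnet):
    """Illustrative: map each writtenForm to the synset ids found on its child elements."""
    lexroot = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8")).getroot()
    word2synsets = {}
    for lexentry in lexroot.iter("LexicalEntry"):
        # merge the attribute dicts of the sub-elements into one, as the old code did
        merged = {k: v for child in lexentry for k, v in child.attrib.items()}
        word = merged.get("writtenForm")
        synset = merged.get("synset")
        if word and synset:
            word2synsets.setdefault(word.lower(), []).append(synset)
    return word2synsets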
@@ -167,6 +100,9 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
             if 'writtenForm' in lex_dict.keys():
                 string = (lex_dict["writtenForm"])
+                if string == "Kennwort":
+                    pass
                 # replaceRockDots
                 string = re.sub(r'[ß]', "ss", string)
                 string = re.sub(r'[ö]', "oe", string)
@@ -186,10 +122,12 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
         word2synsets[string] = synlist
     synset2Words = {}
     template = {"s1": ["w1","w2"]}
     for word,synset in word2synsets.items():
+        if word != '':
             for syn in synset:
                 if syn not in synset2Words.keys():
                     synset2Words[syn] = [word]
@@ -203,91 +141,135 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
     thesaurus = {}
     thesaurus_template = {"w1" : "mainsyn"}
     for word,synset in word2synsets.items():
         try:
-            thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym
+            thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym #todo nach (hauptform) suchen?
         except:
             pass
+    if returnall:
+        return thesaurus, word2synsets, synset2Words
+    else:
         return thesaurus
+def create_stopword_lists(*paths):
     """
-    for r in synroot:
+    creates a list of stoppwords from:
-        for element in r:
+    spacy
+    nltk
+    stop_words
-            if element.tag == "Synset":
+    :param paths: list of additional filepaths where each file looks like
-                synset = []
+    w1
-                attrib = element.attrib
+    w2
-                id = attrib["id"]
+    w3
+    filenames must be a la de_stopwords_1.txt, en_stopwords_2.txt
-                if id not in synset2Words.keys():
+    :return: lists: de_stopwords, en_stopwords
-                    synset2Words[id] = "WORD"
     """
+    ## GERMAN
+    # from packages
+    de_stop_words1 = list(get_stop_words("de"))
+    de_stop_words2 = list(nltk_stopwords.words('german'))
+    de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
+    #from files
+    de_filepaths = []
+    for path in paths:
+        if os.path.basename(path).split("_")[0] == 'de' and os.path.basename(path).split("_")[
+            1] == 'stopwords':
+            de_filepaths.append(path)
-def create_stopwordlist():
+    de_stop_words4 = list_from_files(*de_filepaths)
-    de_stop_words1 = list(map(replaceRockDots(),
+    #combine everything
-                              list(
+    de_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
-                                  map(textacy.preprocess.normalize_whitespace,
+                                                             de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
-                                      textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")
-                                  )
-                              )
-    )
-    )
-    de_stop_words2 = list(map(replaceRockDots(),list(set(nltk_stopwords.words('german')))))
-    de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))
-    de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))
-    de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))
-    return de_stop_words
-#todo en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(nltk_stopwords.words('english'))))
+    ## ENGLISH
+    # from packages
+    en_stop_words1 = list(get_stop_words("en"))
+    en_stop_words2 = list(nltk_stopwords.words('english'))
+    en_stop_words3 = list(__import__("spacy.en", globals(), locals(), ['object']).STOP_WORDS)
+    # from files
+    en_filepaths = [path for path in paths if
+                    os.path.basename(path).split("_")[0] == 'en' and os.path.basename(path).split("_")[
+                        1] == 'stopwords']
+    en_stop_words4 = list_from_files(*en_filepaths)
-########################## Spellchecking ##########################################
+    # combine everything
-# http://norvig.com/spell-correct.html
+    en_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
-# http://wortschatz.uni-leipzig.de/en/download
+                                                             en_stop_words1 + en_stop_words2 + en_stop_words3 + en_stop_words4)))))
+    return de_stop_words, en_stop_words
+def build_words_for_spellchecking(path2words):
+    """
+    create word-Counter for spellchecking
+    http://norvig.com/spell-correct.html
+    http://wortschatz.uni-leipzig.de/en/download
+    http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz
+    :return: Counter
+    """
     def words(text): return re.findall(r'\w+', text.lower())
+    return Counter(words(open(path2words).read()))
 ##################################################################################################
-# ziel: dictionaries für thesaurus, correctwordliste und lemmas als ladbare dateien
-# außerdem saubere stoppwortliste und nomenliste
 # THESAURUS
-lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
+path2wordnet = config.get("thesaurus","input")
-#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
+path2thesaurus_dict = config.get("thesaurus","pickle_file")
-path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
 # SPELLCHECKING
-path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
+path2words_file = config.get("spellchecking","input")
-path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
+path2wordlist = config.get("spellchecking","pickle_file")
+# LEMMA
+path2lemma_file = config.get("lemmatization","input")
+path2lemmadict = config.get("lemmatization","pickle_file")
+# NOMEN
+nouns1 = config.get("nouns","input1")
+nouns2 = config.get("nouns","input2")
+path2nouns_list = config.get("nouns","pickle_file")
-path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
+# VORNAMEN
-path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
+firstnames_txt = config.get("firstnames","input")
-path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
+path2firstnameslist = config.get("firstnames","pickle_file")
-path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
+# STOPWORDS
+stop1 = config.get("de_stopwords","input1")
+stop2 = config.get("de_stopwords","input2")
+stop3 = config.get("de_stopwords","input3")
+path2stopwordlist = config.get("de_stopwords","pickle_file")
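
build_words_for_spellchecking follows Norvig's frequency-based approach: the Leipzig news corpus is reduced to a Counter of word frequencies that a corrector can later consult. A short usage sketch; the candidate-generation step is Norvig's published recipe rather than part of this commit, so it is only hinted at, and the file path is shortened for illustration.

from collections import Counter
import re

def words(text):
    return re.findall(r'\w+', text.lower())

# as in build_words_for_spellchecking(): one big frequency table
WORDS = Counter(words(open("deu_news_2015_1M-sentences.txt").read()))

def probability(word, n=sum(WORDS.values())):
    """Relative frequency of a word; unknown words get 0."""
    return WORDS[word] / n

# a corrector would pick max(candidates, key=probability) over the edit-distance-1/2
# candidates of a misspelled token (see http://norvig.com/spell-correct.html)
print(probability("und"), probability("unt"))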
@@ -297,71 +279,42 @@ def main():
     printlog("create and save lemma_dict")
-    LEMMAS = list(
+    lemma_dict = create_lemma_dict(path2lemma_file)
-        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
-    lemma_dict = create_lemma_dict(LEMMAS)
     save_obj(lemma_dict, path2lemmadict)
     printlog("Build and save Wordlist for Spellchecking")
-    WORDS = Counter(words(open(path2words).read()))
+    words = build_words_for_spellchecking(path2words_file)
-    save_obj(WORDS, path2wordlist)
+    save_obj(words, path2wordlist)
     printlog("Build and save Thesaurus")
-    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)
+    thesaurus = build_thesaurus_dict(path2wordnet)
+    save_obj(thesaurus, path2thesaurus_dict)
-    save_obj(THESAURUS, path2thesaurusdict)
     printlog("Build and save stoppwortliste")
-    de_stop_words = create_stopwordlist()
+    de_stop_words = create_stopword_lists(stop1, stop2, stop3)
     save_obj(de_stop_words, path2stopwordlist)
     printlog("Build and save nomenliste")
-    NOUNS = list(textacy.fileio.read_file_lines(
+    nouns = list_from_files(nouns1,nouns2)
-        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(
+    save_obj(nouns, path2nouns_list)
-        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
-    NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
-    save_obj(NOUNS, path2NOUNSlist)
-    printlog("Build and save fistnameslist")
-    VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
-        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
-    save_obj(VORNAMEN, path2firstnameslist)
+    printlog("Build and save firstnameslist")
+    vornamen = list_from_files(firstnames_txt)
+    save_obj(vornamen, path2firstnameslist)
     end = time.time()
-    printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
+    printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60))
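
init.main() persists every dictionary and list as a pickle at the paths named in config.ini, so downstream modules only need load_obj. A minimal sketch of reading the artifacts back; load_obj and the config keys come from this commit, the config path is shortened, and which artifacts a given script actually needs is up to the caller.

import configparser, pickle

def load_obj(path):
    # mirrors the load_obj helper introduced in this commit
    with open(path, 'rb') as f:
        return pickle.load(f)

config = configparser.ConfigParser()
with open("config.ini") as f:      # full path as in config.ini, shortened here
    config.read_file(f)

thesaurus    = load_obj(config.get("thesaurus", "pickle_file"))     # word -> main synonym
lemma_dict   = load_obj(config.get("lemmatization", "pickle_file")) # word -> lemma
de_stopwords = load_obj(config.get("de_stopwords", "pickle_file"))  # list of stopwords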
New file (21 lines added):

@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+
+import init
+import corporization
+import preprocessing
+from miscellaneous import *
+
+# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
+
+init.main()
+printlog("")
+
+corporization.main()
+printlog("")
+
+preprocessing.main()
+printlog("")
New file (281 lines added):

@@ -0,0 +1,281 @@
+# -*- coding: utf-8 -*-
+import random
+import time
+from pathlib import Path
+from datetime import datetime
+import logging
+from nltk.corpus import stopwords
+import csv
+import functools
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+
+from datetime import datetime
+import time
+start = time.time()
+
+import logging
+from nltk.corpus import stopwords
+import csv
+import functools
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+csv.field_size_limit(sys.maxsize)
+
+import time
+import enchant
+start = time.time()
+
+import logging
+import csv
+import functools
+import os.path
+import re
+import subprocess
+import time
+import xml.etree.ElementTree as ET
+import sys
+import spacy
+import textacy
+from scipy import *
+from textacy import Vectorizer
+import warnings
+import configparser as ConfigParser
+import sys
+import hunspell
+from postal.parser import parse_address
+
+from datetime import datetime
+import time
+import logging
+from nltk.corpus import stopwords as nltk_stopwords
+from collections import Counter
+import csv
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+csv.field_size_limit(sys.maxsize)
+import pickle
+
+# load config
+config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config = ConfigParser.ConfigParser()
+with open(config_ini) as f:
+    config.read_file(f)
+
+# config logging
+filename = config.get("logging","filename")
+level = config.get("logging","level")
+if level == "INFO":
+    level = logging.INFO
+elif level == "DEBUG":
+    level = logging.DEBUG
+elif level == "WARNING":
+    level = logging.WARNING
+logging.basicConfig(filename=filename, level=level)
+
+def printlog(string, level="INFO"):
+    """log and prints"""
+    print(string)
+    if level == "INFO":
+        logging.info(string)
+    elif level == "DEBUG":
+        logging.debug(string)
+    elif level == "WARNING":
+        logging.warning(string)
+
+def compose(*functions):
+    def compose2(f, g):
+        return lambda x: f(g(x))
+    return functools.reduce(compose2, functions, lambda x: x)
+
+def get_calling_function():
+    """finds the calling function in many decent cases.
+    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
+    """
+    fr = sys._getframe(1)  # inspect.stack()[1][0]
+    co = fr.f_code
+    for get in (
+            lambda: fr.f_globals[co.co_name],
+            lambda: getattr(fr.f_locals['self'], co.co_name),
+            lambda: getattr(fr.f_locals['cls'], co.co_name),
+            lambda: fr.f_back.f_locals[co.co_name],  # nested
+            lambda: fr.f_back.f_locals['func'],  # decorators
+            lambda: fr.f_back.f_locals['meth'],
+            lambda: fr.f_back.f_locals['f'],
+    ):
+        try:
+            func = get()
+        except (KeyError, AttributeError):
+            pass
+        else:
+            if func.__code__ == co:
+                return func
+    raise AttributeError("func not found")
+
+def save_obj(obj, path):
+    with open(path , 'wb') as f:
+        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
+
+def load_obj(path):
+    with open(path, 'rb') as f:
+        return pickle.load(f)
+
+def replaceRockDots():
+    return lambda string: re.sub(r'[ß]', "ss",
+                                 (re.sub(r'[ö]', "oe",
+                                         (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
+
+def list_from_files(*paths):
+    """
+    create string-list from file like
+    n1
+    n2
+    n3
+
+    :param paths: list(str) or str if single path
+    :return: list(str)
+    """
+    listlist = []
+    for path in paths:
+        listlist.append(list(textacy.fileio.read_file_lines(path)))
+    #liste von listen zu einer liste
+    liste = [item for sublist in listlist for item in sublist]
+    return list(map(textacy.preprocess.normalize_whitespace, liste))
+
+def printRandomDoc(textacyCorpus):
+    """
+    printlogss random doc out of a textacy-Corpus
+    :param textacyCorpus:
+    """
+    print()
+    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
+    randIndex = int((len(textacyCorpus) - 1) * random.random())
+    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
+                                                               textacyCorpus[randIndex].metadata))
+    print()
+
+def save_corpus(corpus, corpus_path, corpus_name):
+    """
+    saves a textacy-corpus including spacy-parser
+    :param corpus: textacy-Corpus
+    :param corpus_path: str
+    :param corpus_name: str (should content the language like "_de_")
+    """
+
+    """
+    # save stringstore
+    stringstore_path = corpus_path + corpus_name + '_strings.json'
+    with open(stringstore_path, "w") as file:
+        parser.vocab.strings.dump(file)
+
+    #todo save vocab?
+    """
+
+    # save parser
+    parser = corpus.spacy_lang
+    parserpath = corpus_path + str(parser.lang) + '_parser'
+    parser.save_to_directory(parserpath)
+
+    # save content
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
+
+    # save meta
+    metapath = corpus_path + corpus_name + "_meta.json"
+    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
+
+def load_corpus(corpus_path, corpus_name, lang="de"):
+    """
+    Load textacy-Corpus including spacy-parser out from file
+    :param corpus_path: str
+    :param corpus_name: str (should content the language like "_de_")
+    :param lang: str language code)
+    :return: texracy.Corpus, spacy.language
+    """
+
+    #ckeck for language
+    if "_de_" in corpus_name:
+        lang="de"
+    elif "_en_" in corpus_name:
+        lang ="en"
+
+    # load parser
+    parser = spacy.load(lang)
+
+    stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
+    with open(stringstorepath) as file:
+        parser.vocab.strings.load(file)
+
+    vocabpath = Path(corpus_path + str(lang) + '_parser'+'/vocab/lexemes.bin')
+    parser.vocab.load_lexemes(vocabpath)
+
+    #load corpus
+    corpus = textacy.Corpus(parser)
+
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    metapath = corpus_path + corpus_name + "_meta.json"
+
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    return corpus, corpus.spacy_lang
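
save_corpus and load_corpus round-trip a textacy corpus as three artifacts side by side: the serialized spacy pipeline, a _content.bin with the parsed docs, and a _meta.json with one metadata dict per doc. A hedged usage sketch built on the functions above; paths and corpus names mirror config.ini, and the fileio calls follow the textacy 0.4-era API that the commit itself uses, not the current textacy API.

from miscellaneous import save_corpus, load_corpus, printRandomDoc

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"

# after corporization.main() has filled raw_de_corpus:
# save_corpus(corpus=raw_de_corpus, corpus_path=corpus_path, corpus_name="de_raw_ticket")

# later, e.g. at the start of preprocessing:
raw_de_corpus, de_parser = load_corpus(corpus_path, "de_raw_ticket")
printRandomDoc(raw_de_corpus)   # spot-check one document and its metadata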
(additional hunks from another changed file; the file name is not preserved in this view)

@@ -420,10 +420,10 @@ custom_words = ["grüßen", "fragen"]
 ####################'####################'####################'####################'####################'##############
-## files to textacy-corpus
+## files to textacy-corpi
 textacyCorpus = textacy.Corpus(PARSER)
-print("add texts to textacy-corpus...")
+print("add texts to textacy-corpi...")
 textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))
@@ -182,8 +182,8 @@ cleanStream = compose(
     cleanEnt
 )
 """
-# content: xml -> stringCleaning -> pipe -> docCleaning -> corpus
+# content: xml -> stringCleaning -> pipe -> docCleaning -> corpi
-# metadata: xml ->              -> stringCleaning -> corpus
+# metadata: xml ->              -> stringCleaning -> corpi
 corpus = textacy.Corpus(PARSER)
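
cleanStream above is built with the compose() helper from the new shared module: a pipeline of one-argument cleaning functions applied right-to-left. A small illustration with stand-in cleaning steps; only compose() and replaceRockDots() come from this commit, the particular chain is hypothetical.

from miscellaneous import compose, replaceRockDots
import textacy

# single-string cleaners, composed right-to-left (last listed runs first)
clean = compose(
    textacy.preprocess.normalize_whitespace,  # applied last
    replaceRockDots(),                        # ä/ö/ü/ß -> ae/oe/ue/ss
    str.lower,                                # applied first
)

print(clean("  Grüße  aus   Dortmund "))   # -> "gruesse aus dortmund"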
preprocessing.py (755 lines changed)

@@ -2,27 +2,53 @@
 from datetime import datetime
 print(datetime.now())
+from datetime import datetime
+import time
+import logging
+from stop_words import get_stop_words
+#import words as words
+from nltk.corpus import stopwords as nltk_stopwords
+from collections import Counter
+import csv
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+csv.field_size_limit(sys.maxsize)
+import pickle
+import configparser as ConfigParser
+from miscellaneous import *
-path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
-path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
-#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
-path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
-path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"
-#idee roh-corpus (nur whitespace weg) speichern -> pregeprocesster corpus -> damit arbeiten
-path_csv_split = path2de_csv.split("/")
-print(path_csv_split[len(path_csv_split) - 1])
-path_csv_split = path2en_csv.split("/")
-print(path_csv_split[len(path_csv_split) - 1])
 import time
-start = time.time()
+from datetime import datetime
+import logging
+from nltk.corpus import stopwords
+import csv
+import functools
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+csv.field_size_limit(sys.maxsize)
+import time
 import logging
 from nltk.corpus import stopwords
@ -40,231 +66,29 @@ csv.field_size_limit(sys.maxsize)
|
||||||
|
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
def save_obj(obj, path):
|
|
||||||
with open(path + '.pkl', 'wb') as f:
|
|
||||||
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
|
|
||||||
|
|
||||||
def load_obj(path ):
|
|
||||||
with open(path + '.pkl', 'rb') as f:
|
|
||||||
return pickle.load(f)
|
|
||||||
|
|
||||||
|
|
||||||
def load_corpus(corpus_path, corpus_name, lang="de"):
|
|
||||||
|
|
||||||
contentpath = corpus_path + corpus_name + "_content.bin"
|
|
||||||
metapath = corpus_path + corpus_name + "_meta.json"
|
|
||||||
|
|
||||||
#load parser
|
|
||||||
parserpath = corpus_path + str(lang) + '_parser'
|
|
||||||
parser = spacy.load(parserpath)
|
|
||||||
|
|
||||||
corpus = textacy.Corpus(parser)
|
|
||||||
|
|
||||||
|
|
||||||
metadata_stream = textacy.fileio.read_json_lines(metapath)
|
|
||||||
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
|
|
||||||
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
|
|
||||||
corpus.add_doc(
|
|
||||||
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
|
|
||||||
return corpus
|
|
||||||
|
|
||||||
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
|
|
||||||
corpus_name = "de_raw_ticketCorpus"
|
|
||||||
|
|
||||||
print(load_corpus(corpus_path,corpus_name))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# load config
|
||||||
|
|
||||||
|
|
||||||
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# todo configuration file ?
|
|
||||||
"""
|
|
||||||
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
|
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
|
||||||
|
|
||||||
config = ConfigParser.ConfigParser()
|
config = ConfigParser.ConfigParser()
|
||||||
with open(config_ini) as f:
|
with open(config_ini) as f:
|
||||||
config.read_file(f)
|
config.read_file(f)
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# config logging
|
|
||||||
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
|
|
||||||
logging.basicConfig(filename=logfile, level=logging.INFO)
|
|
||||||
# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# THESAURUS
|
REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
|
||||||
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
|
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
|
||||||
THESAURUS = load_obj(path2thesaurusdict)
|
|
||||||
|
|
||||||
|
|
||||||
# SPELLCHECKING
|
|
||||||
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
|
|
||||||
|
|
||||||
|
|
||||||
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
|
|
||||||
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
|
|
||||||
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
|
|
||||||
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# SPELLCHECKING
|
|
||||||
|
|
||||||
|
|
||||||
parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
|
|
||||||
|
|
||||||
DE_PARSER = spacy.load("de")
|
|
||||||
EN_PARSER = spacy.load("en")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
|
|
||||||
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))
|
|
||||||
|
|
||||||
en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(stopwords.words('english'))))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
LEMMAS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
|
|
||||||
|
|
||||||
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
|
|
||||||
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
|
|
||||||
|
|
||||||
NOUNS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
|
|
||||||
NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
print(de_stop_words[10:30])
|
|
||||||
print(LEMMAS[10:30])
|
|
||||||
print(VORNAMEN[10:30])
|
|
||||||
print(NOUNS[10:30])
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
|
|
||||||
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
|
|
||||||
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
|
|
||||||
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
|
|
||||||
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
|
|
||||||
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def printlog(string, level="INFO"):
|
|
||||||
"""log and prints"""
|
|
||||||
print(string)
|
|
||||||
if level == "INFO":
|
|
||||||
logging.info(string)
|
|
||||||
elif level == "DEBUG":
|
|
||||||
logging.debug(string)
|
|
||||||
elif level == "WARNING":
|
|
||||||
logging.warning(string)
|
|
||||||
|
|
||||||
printlog("Load functions")
|
|
||||||
|
|
||||||
|
|
||||||
def compose(*functions):
|
|
||||||
def compose2(f, g):
|
|
||||||
return lambda x: f(g(x))
|
|
||||||
|
|
||||||
return functools.reduce(compose2, functions, lambda x: x)
|
|
||||||
|
|
||||||
|
|
||||||
def get_calling_function():
|
|
||||||
"""finds the calling function in many decent cases.
|
|
||||||
https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
|
|
||||||
"""
|
|
||||||
fr = sys._getframe(1) # inspect.stack()[1][0]
|
|
||||||
co = fr.f_code
|
|
||||||
for get in (
|
|
||||||
lambda: fr.f_globals[co.co_name],
|
|
||||||
lambda: getattr(fr.f_locals['self'], co.co_name),
|
|
||||||
lambda: getattr(fr.f_locals['cls'], co.co_name),
|
|
||||||
lambda: fr.f_back.f_locals[co.co_name], # nested
|
|
||||||
lambda: fr.f_back.f_locals['func'], # decorators
|
|
||||||
lambda: fr.f_back.f_locals['meth'],
|
|
||||||
lambda: fr.f_back.f_locals['f'],
|
|
||||||
):
|
|
||||||
try:
|
|
||||||
func = get()
|
|
||||||
except (KeyError, AttributeError):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
if func.__code__ == co:
|
|
||||||
return func
|
|
||||||
raise AttributeError("func not found")
|
|
||||||
|
|
||||||
|
|
||||||
def printRandomDoc(textacyCorpus):
|
|
||||||
import random
|
|
||||||
print()
|
|
||||||
|
|
||||||
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
|
|
||||||
randIndex = int((len(textacyCorpus) - 1) * random.random())
|
|
||||||
printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
|
|
||||||
textacyCorpus[randIndex].metadata))
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
|
|
||||||
def csv_to_contentStream(path2csv: str, content_collumn_name: str):
|
|
||||||
"""
|
|
||||||
:param path2csv: string
|
|
||||||
:param content_collumn_name: string
|
|
||||||
:return: string-generator
|
|
||||||
"""
|
|
||||||
stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8')
|
|
||||||
content_collumn = 0 # standardvalue
|
|
||||||
|
|
||||||
for i, lst in enumerate(stream):
|
|
||||||
if i == 0:
|
|
||||||
# look for desired column
|
|
||||||
for j, col in enumerate(lst):
|
|
||||||
if col == content_collumn_name:
|
|
||||||
content_collumn = j
|
|
||||||
else:
|
|
||||||
yield lst[content_collumn]
|
|
||||||
|
|
||||||
|
|
||||||
def csv_to_metaStream(path2csv: str, metalist: [str]):
|
|
||||||
"""
|
|
||||||
:param path2csv: string
|
|
||||||
:param metalist: list of strings
|
|
||||||
:return: dict-generator
|
|
||||||
"""
|
|
||||||
stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8')
|
|
||||||
|
|
||||||
content_collumn = 0 # standardvalue
|
|
||||||
metaindices = []
|
|
||||||
metadata_temp = {}
|
|
||||||
for i, lst in enumerate(stream):
|
|
||||||
if i == 0:
|
|
||||||
for j, col in enumerate(lst): # geht bestimmt effizienter... egal, weil passiert nur einmal
|
|
||||||
for key in metalist:
|
|
||||||
if key == col:
|
|
||||||
metaindices.append(j)
|
|
||||||
metadata_temp = dict(
|
|
||||||
zip(metalist, metaindices)) # zB {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}
|
|
||||||
|
|
||||||
else:
|
|
||||||
metadata = metadata_temp.copy()
|
|
||||||
for key, value in metadata.items():
|
|
||||||
metadata[key] = lst[value]
|
|
||||||
yield metadata
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
THESAURUS = {}
WORDS = {}
LEMMAS = {}
NOUNS = []
VORNAMEN = []
de_stop_words = []
############# filter tokens


@@ -303,14 +127,12 @@ def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search('\d', tok.lower_))


"""
def remove_words_containing_topLVL():
-    return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))
+    return lambda tok: not bool(re.search(REGEX_TOPLVL, tok.lower_))


def remove_words_containing_specialCharacters():
-    return lambda tok: not bool(re.search(regex_specialChars, tok.lower_))
+    return lambda tok: not bool(re.search(REGEX_SPECIALCHAR, tok.lower_))
"""


def remove_long_words():
@@ -327,237 +149,28 @@ def remove_first_names():


############# strings


def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))


def remove_addresses(string):
    pass  # todo


def lemmatizeWord(word, lemma_dict=LEMMAS, n=3):
"""
|
|
||||||
def stringcleaning(stringstream, funclist):
|
|
||||||
for string in stringstream:
|
|
||||||
for f in funclist:
|
|
||||||
|
|
||||||
string = f(string)
|
|
||||||
yield string
|
|
||||||
|
|
||||||
def cut_after(word="gruss"):
|
|
||||||
return lambda string: string.rpartition(word)[0] if word in string else string
|
|
||||||
|
|
||||||
def seperate_words_on_regex(regex=regex_specialChars):
|
|
||||||
return lambda string: " ".join(re.compile(regex).split(string))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def remove_words_containing_topLVL():
|
|
||||||
return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w) ])
|
|
||||||
|
|
||||||
|
|
||||||
def replaceSpecialChars(replace_with=" "):
|
|
||||||
return lambda string: re.sub(regex_specialChars, replace_with, string.lower())
|
|
||||||
|
|
||||||
|
|
||||||
def replaceNumbers(replace_with="NUMBER"):
|
|
||||||
return lambda string : textacy.preprocess.replace_numbers(string.lower(), replace_with=replace_with)
|
|
||||||
|
|
||||||
|
|
||||||
def replacePhonenumbers(replace_with="PHONENUMBER"):
|
|
||||||
return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)
|
|
||||||
|
|
||||||
|
|
||||||
def replaceSharpS(replace_with="ss"):
|
|
||||||
return lambda string: re.sub(r'[ß]',replace_with,string.lower())
|
|
||||||
|
|
||||||
def fixUnicode():
|
|
||||||
return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
|
|
||||||
"""
|
|
||||||
|
|
||||||
"""
|
|
||||||
def lemmatizeWord(word,filepath=LEMMAS):
|
|
||||||
for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
|
|
||||||
if word.lower() == line.split()[1].strip().lower():
|
|
||||||
return line.split()[0].strip().lower()
|
|
||||||
return word.lower() # falls nix gefunden wurde
|
|
||||||
|
|
||||||
|
|
||||||
def create_lemma_dicts(lemmalist=LEMMAS):
|
|
||||||
w_dict = {}
|
|
||||||
lem_dict = {}
|
|
||||||
|
|
||||||
for i, line in enumerate(lemmalist):
|
|
||||||
try:
|
|
||||||
lem_word_pair = line.split()
|
|
||||||
|
|
||||||
if len(lem_word_pair) != 2:
|
|
||||||
print(line)
|
|
||||||
|
|
||||||
lemma = lem_word_pair[0].strip().lower()
|
|
||||||
|
|
||||||
word = lem_word_pair[1].strip().lower()
|
|
||||||
except:
|
|
||||||
print(line)
|
|
||||||
|
|
||||||
if lemma not in lem_dict:
|
|
||||||
lem_dict[lemma] = i
|
|
||||||
|
|
||||||
if word not in w_dict:
|
|
||||||
w_dict[word] = lem_dict[lemma]
|
|
||||||
|
|
||||||
l_dict = {v: k for k, v in lem_dict.items()} # switch key/values
|
|
||||||
|
|
||||||
return l_dict,w_dict
|
|
||||||
|
|
||||||
lemma_dict,word_dict = create_lemma_dicts()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def lemmatizeWord(word,l_dict=lemma_dict,w_dict=word_dict):
|
|
||||||
#mehrmals machen
|
|
||||||
for i in range(3):
|
|
||||||
try:
|
|
||||||
word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower()
|
|
||||||
except:
|
|
||||||
print(word)
|
|
||||||
return word
|
|
||||||
|
|
||||||
def lemmatize():
|
|
||||||
return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
|
|
||||||
|
|
||||||
def lemmatize():
|
|
||||||
return lambda string: " ".join([lemmatizeWord(s.lower()) for s in string.split()])
|
|
||||||
|
|
||||||
DE_SPELLCHECKER = enchant.Dict("de_DE")
|
|
||||||
EN_SPELLCHECKER = enchant.Dict("en_US")
|
|
||||||
|
|
||||||
def autocorrectWord(word,spellchecker=DE_SPELLCHECKER):
|
|
||||||
|
|
||||||
try:
|
|
||||||
return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
|
|
||||||
except:
|
|
||||||
return word
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def autocorrect():
|
|
||||||
return lambda string: " ".join([autocorrectWord(s.lower()) for s in string.split()])
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def create_lemma_dicts(lemmalist=LEMMAS):
    w_dict = {}
    lem_dict = {}

    for i, line in enumerate(lemmalist):
        try:
            lem_word_pair = line.split()

            if len(lem_word_pair) != 2:
                print(line)

            lemma = lem_word_pair[0].strip().lower()
            word = lem_word_pair[1].strip().lower()
        except:
            print(line)

        if lemma not in lem_dict:
            lem_dict[lemma] = i

        if word not in w_dict:
            w_dict[word] = lem_dict[lemma]

    l_dict = {v: k for k, v in lem_dict.items()}  # switch key/values

    return l_dict, w_dict


lemma_dict, word_dict = create_lemma_dicts()


def lemmatizeWord(word, l_dict=lemma_dict, w_dict=word_dict, n=3):
    # apply several times
    for i in range(n):
        try:
-            word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower()
+            word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
        except:
            print(word)
    return word
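A short sketch of how the dictionary-based lemmatization behaves. The lemma pairs below are invented for illustration; only the repeated-lookup pattern mirrors the function above.

# Hypothetical illustration -- the lemma dictionary entries are made up:
lemma_dict_example = {"haeuser": "haus", "gesendet": "senden"}

def lemmatize_example(word, lemma_dict, n=3):
    # repeated lookup, n passes, so chained mappings also resolve
    for _ in range(n):
        word = lemma_dict.get(word.lower(), word.lower())
    return word

print(lemmatize_example("Haeuser", lemma_dict_example))  # -> "haus"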
def build_thesaurus(path2lexicalentries, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    synroot = syntree.getroot()

    thesaurus = []

    for r in synroot:
        for element in r:

            if element.tag == "Synset":
                sysnet = []
                attrib = element.attrib
                id = attrib["id"]

                for ro in lexroot:
                    for elem in ro:
                        if elem.tag == "LexicalEntry":
                            subs_dicts = [subentry.attrib for subentry in elem]
                            # <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]

                            dic = {k: v for x in subs_dicts for k, v in x.items()}  # to one dict
                            if "synset" in dic.keys():
                                if dic["synset"] == id:
                                    string = (dic["writtenForm"])

                                    # replaceRockDots
                                    string = re.sub(r'[ß]', "ss", string)
                                    string = re.sub(r'[ö]', "oe", string)
                                    string = re.sub(r'[ü]', "ue", string)
                                    string = re.sub(r'[ä]', "ae", string)

                                    # remove all dots
                                    string = re.sub(r'[.]', "", string)

                                    # remove everything in parentheses
                                    string = re.sub(r"\((.*)\)", " ", string)

                                    # normalize longer whitespace
                                    string = textacy.preprocess.normalize_whitespace(string)

                                    sysnet.append(string.lower().strip())

                # sort by the number of words in each string
                sysnet.sort(key=lambda x: len(x.split()))
                if len(sysnet) != 0:
                    # todo: why are some of them empty?
                    thesaurus.append(sysnet)
    return thesaurus


printlog("Build Thesaurus")
THESAURUS = []
THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)
def getFirstSynonym(word, thesaurus=THESAURUS):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

-    # iterate over the thesaurus
-    for syn_block in thesaurus:  # syn_block is a list of synonyms
-        for syn in syn_block:
-            syn = syn.lower()
-            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word; todo: normalize phrases as well
-                if word == syn:
-                    return syn_block[0]
-
-    return str(word)  # fall back to the original word if nothing was found
+    if word in thesaurus.keys():
+        return thesaurus[word]
+    else:
+        return str(word)
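A small sketch of the dict-based lookup the rewritten getFirstSynonym relies on. The thesaurus entries below are invented for illustration; only the lookup-with-fallback pattern matches the function above.

# Hypothetical illustration -- the thesaurus entries are made up:
thesaurus_example = {"rechner": "computer", "pc": "computer"}

def get_first_synonym_example(word, thesaurus):
    word = word.lower()
    return thesaurus.get(word, word)  # fall back to the word itself

print(get_first_synonym_example("Rechner", thesaurus_example))  # -> "computer"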
########################## Spellchecking ##########################################

@@ -570,10 +183,6 @@ from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

-printlog("Build Wordlist for Spellchecking")
-WORDS = {}
-WORDS = Counter(words(open(path2words).read()))

def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N

@@ -610,18 +219,6 @@ def edits2(word):
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))


-"""
-DE_SPELLCHECKER = enchant.Dict("de_DE")
-EN_SPELLCHECKER = enchant.Dict("en_US")
-
-def autocorrectWord(word, spellchecker=DE_SPELLCHECKER):
-    try:
-        return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
-    except:
-        return word
-"""


def autocorrectWord(word):
    try:
        return correction(word)

@@ -629,15 +226,10 @@ def autocorrectWord(word):
        return word


##################################################################################################
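The word-frequency spellchecker here follows the familiar Norvig pattern: WORDS counts token frequencies and P(word) is the relative frequency used to rank candidate corrections. A tiny self-contained sketch with an invented corpus string:

# Hypothetical illustration -- the corpus text is made up:
import re
from collections import Counter

def words_example(text):
    return re.findall(r'\w+', text.lower())

WORDS_example = Counter(words_example("drucker drucker druker monitor"))

def P_example(word, N=sum(WORDS_example.values())):
    return WORDS_example[word] / N

print(P_example("drucker"))  # -> 0.5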
############# stringcleaning


def stringcleaning(stringstream):
-    regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
-    regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

    for string in stringstream:
        string = string.lower()

@@ -646,7 +238,7 @@ def stringcleaning(stringstream):
        string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')

        # remove_words_containing_topLVL
-        string = " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)])
+        string = " ".join([w.lower() for w in string.split() if not re.search(REGEX_TOPLVL, w)])

        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)

@@ -655,7 +247,7 @@ def stringcleaning(stringstream):
        string = re.sub(r'[ä]', "ae", string)

        # seperate_words_on_regex:
-        string = " ".join(re.compile(regex_specialChars).split(string))
+        string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))

        # cut_after
        word = "gruss"

@@ -672,8 +264,27 @@ def stringcleaning(stringstream):

        yield string
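A sketch of what one pass of this cleaning does to a raw ticket line. The regex values are copied from the removed local definitions above and may differ from the module-level REGEX_* constants; the sample sentence is invented.

# Hypothetical illustration:
import re

REGEX_SPECIALCHAR_example = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
REGEX_TOPLVL_example = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

s = "Grüße, bitte www.example.de prüfen!".lower()
# drop words that look like they contain a top-level domain
s = " ".join(w for w in s.split() if not re.search(REGEX_TOPLVL_example, w))
# replaceRockDots
s = re.sub(r'[ß]', "ss", re.sub(r'[ö]', "oe", re.sub(r'[ü]', "ue", re.sub(r'[ä]', "ae", s))))
# split on special characters
s = " ".join(re.compile(REGEX_SPECIALCHAR_example).split(s))
print(s)  # -> "gruesse  bitte pruefen " (modulo extra whitespace)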
def filterTokens(tokens, funclist):
    # in: tokenlist, funclist
    # out: tokenlist
    for f in funclist:
        tokens = list(filter(f, tokens))

    return tokens


def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text


def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata
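A minimal sketch of how filterTokens applies a list of predicates. Plain strings stand in for spaCy tokens here, and the two filters are invented for illustration.

# Hypothetical illustration:
def filterTokens_example(tokens, funclist):
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens

not_too_long = lambda tok: len(tok) < 20
no_digits = lambda tok: not any(ch.isdigit() for ch in tok)

print(filterTokens_example(["drucker", "raum1234", "defekt"], [not_too_long, no_digits]))
# -> ['drucker', 'defekt']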
-def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
+def processContentstream(textstream, parser, token_filterlist=None):
    """
    :param textstream: string-gen
    :param funclist: [func]

@@ -681,28 +292,6 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
    :return: string-gen
    """

-    """
-    filter_tokens=[
-        #removeENT(["PERSON"]),
-        # idea: remove addresses; so far handled via cut_after("gruss") --> postal.parser
-        # idea: spelling correction --> PyEnchant
-        # idea: thesaurus --> WordNet, own thesaurus
-
-        remove_words_containing_Numbers(),
-
-        removePOS(["PUNCT","SPACE","NUM"]),
-
-        removeWords(de_stop_words+custom_words),
-
-        remove_long_words(),
-        remove_short_words(),
-        remove_first_names(),
-
-        keepPOS(["NOUN"]),
-
-    ]
-    """

    # pre_parse
    textstream = stringcleaning(textstream)

@@ -720,8 +309,7 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
        yield " ".join([tok.lower_ for tok in tokens])
        # yield " ".join(list(set([tok.lower_ for tok in tokens])))


-def processDictstream(dictstream, funcdict, parser=DE_PARSER):
+def processDictstream(dictstream, funcdict, parser):
    """

    :param dictstream: dict-gen
@@ -754,58 +342,34 @@ def processDictstream(dictstream, funcdict, parser=DE_PARSER):
        yield result


-def filterTokens(tokens, funclist):
-    # in: tokenlist, funclist
-    # out: tokenlist
-    for f in funclist:
-        tokens = list(filter(f, tokens))
-
-    return tokens
-
-
-def cleanString(string):
-    # replaceRockDots
-    string = re.sub(r'[ß]', "ss", string)
-    string = re.sub(r'[ö]', "oe", string)
-    string = re.sub(r'[ü]', "ue", string)
-    string = re.sub(r'[ä]', "ae", string)
-
-    # normalize longer whitespace
-    string = textacy.preprocess.normalize_whitespace(string)
-
-    return(string)
-
-
-def normalizeTextStream(textstream, clean=False):
-    """
-    :param textstream: string-gen
-    :param parser: spacy-parser
-    :yield: string-gen
-    """
-    for txt in textstream:
-        if clean:
-            yield cleanString(txt)
-        else:
-            yield textacy.preprocess.normalize_whitespace(txt)
-
-
-def nomalizeDictstream(dictstream, clean=False):
-    """
-    :param dictstream: dict-gen
-    :param parser: spacy-parser
-    :yield: dict-gen
-    """
-    for dic in dictstream:
-        result = {}
-        for key, value in dic.items():
-            if clean:
-                result[key] = cleanString(value)
-            else:
-                result[key] = textacy.preprocess.normalize_whitespace(value)
-        yield result
+##################################################################################################
+
+# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
+
+
+path2thesaurus_dict = config.get("thesaurus", "pickle_file")
+
+path2wordsdict = config.get("spellchecking", "pickle_file")
+
+path2lemmadict = config.get("lemmatization", "pickle_file")
+
+path2nouns_list = config.get("nouns", "pickle_file")
+
+path2firstnameslist = config.get("firstnames", "pickle_file")
+
+path2stopwordlist = config.get("de_stopwords", "pickle_file")
+
+
+corpus_de_path = config.get("de_corpus", "path")
+raw_de_name = config.get("de_corpus", "raw")
+pre_de_name = config.get("de_corpus", "pre")
+
+
+corpus_en_path = config.get("en_corpus", "path")
+raw_en_name = config.get("en_corpus", "raw")
+pre_en_name = config.get("en_corpus", "pre")
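The config object used in the lines above is created elsewhere in the module; a sketch of the assumed pattern, using only the section and option names that appear in this diff. The config.ini path is an assumption.

# Assumed setup -- the actual loading code is not part of this excerpt:
import configparser as ConfigParser

config = ConfigParser.ConfigParser()
config.read("config.ini")  # path is an assumption

path2thesaurus_dict = config.get("thesaurus", "pickle_file")
corpus_de_path = config.get("de_corpus", "path")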
@@ -819,6 +383,7 @@ custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanke
custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
                "funktionieren", "kollege", "pruefen", "hoffen"
                ]


filter_tokens = [
    # removeENT(["PERSON"]),
    # idea: remove addresses; so far handled via cut_after("gruss") --> postal.parser

@@ -829,7 +394,8 @@ filter_tokens = [

    removePOS(["PUNCT", "SPACE", "NUM"]),

-    removeWords(de_stop_words + custom_words),
+    #removeWords(de_stop_words + custom_words),
+    removeWords(de_stop_words),

    remove_long_words(),
    remove_short_words(),

@@ -838,11 +404,7 @@ filter_tokens = [

]

-metaliste = [
-    "Subject",
-    "categoryName",
-    "Solution"
-]

clean_in_meta = {
    "Solution": [removePOS(["SPACE"])],

@@ -850,6 +412,78 @@ clean_in_meta = {
    "categoryName": [removePOS(["SPACE", "PUNCT"])]
}
def main():
    start = time.time()
    printlog("Preprocessing: {0}".format(datetime.now()))

    THESAURUS = load_obj(path2thesaurus_dict)
    WORDS = load_obj(path2wordsdict)
    LEMMAS = load_obj(path2lemmadict)
    DE_STOP_WORDS = load_obj(path2stopwordlist)
    NOUNS = load_obj(path2nouns_list)
    VORNAMEN = load_obj(path2firstnameslist)

    # load raw corpus and create new one
    raw_de_corpus, DE_PARSER = load_corpus(corpus_name=raw_de_name, corpus_path=corpus_de_path)
    raw_en_corpus, EN_PARSER = load_corpus(corpus_name=raw_en_name, corpus_path=corpus_en_path)

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)

    ## process and add files to textacy-corpi
    printlog("Preprocess and add texts to textacy-corpi")
    de_corpus.add_texts(
        processContentstream(corpus2Text(raw_de_corpus), token_filterlist=filter_tokens, parser=DE_PARSER),
        processDictstream(corpus2Meta(raw_de_corpus), clean_in_meta, parser=raw_de_corpus.lang)
    )
    en_corpus.add_texts(
        processContentstream(corpus2Text(raw_en_corpus), token_filterlist=filter_tokens, parser=EN_PARSER),
        processDictstream(corpus2Meta(raw_en_corpus), clean_in_meta, parser=raw_en_corpus.lang)
    )

    # kick empty docs out of the corpi
    de_corpus.remove(lambda doc: len(doc) == 0)
    en_corpus.remove(lambda doc: len(doc) == 0)

    for i in range(20):
        printRandomDoc(de_corpus)
        # printRandomDoc(en_corpus)

    # save corpi
    save_corpus(corpus=de_corpus, corpus_path=corpus_de_path, corpus_name=pre_de_name)
    save_corpus(corpus=en_corpus, corpus_path=corpus_en_path, corpus_name=pre_en_name)

    end = time.time()
    printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()
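main() depends on load_obj to read the pickled dictionaries built by the other scripts; only its tail is visible in this diff (see the testra.py hunk below). A sketch of the assumed pair of pickle helpers, with save_obj inferred rather than taken from the repository:

# Assumed implementation of the pickle helpers used above:
import pickle

def save_obj(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    with open(path, 'rb') as f:
        return pickle.load(f)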
"""
|
"""
|
||||||
pipe=[
|
pipe=[
|
||||||
|
|
||||||
|
@ -889,37 +523,24 @@ pipe=[
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
de_corpus = textacy.Corpus(DE_PARSER)
|
"""
|
||||||
en_corpus = textacy.Corpus(EN_PARSER)
|
filter_tokens=[
|
||||||
|
#removeENT(["PERSON"]),
|
||||||
|
#idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser
|
||||||
|
#idee rechtschreibkorrektur --> PyEnchant
|
||||||
|
#idee thesaurus --> WordNet, eigener
|
||||||
|
|
||||||
|
remove_words_containing_Numbers(),
|
||||||
|
|
||||||
|
removePOS(["PUNCT","SPACE","NUM"]),
|
||||||
|
|
||||||
## add files to textacy-corpus,
|
removeWords(de_stop_words+custom_words),
|
||||||
printlog("Add texts to textacy-corpus")
|
|
||||||
de_corpus.add_texts(
|
|
||||||
processContentstream(csv_to_contentStream(path2de_csv, "Description"), token_filterlist=filter_tokens),
|
|
||||||
processDictstream(csv_to_metaStream(path2de_csv, metaliste), clean_in_meta)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
remove_long_words(),
|
||||||
|
remove_short_words(),
|
||||||
|
remove_first_names(),
|
||||||
|
|
||||||
|
keepPOS(["NOUN"]),
|
||||||
|
|
||||||
# leere docs aus corpus kicken
|
]
|
||||||
de_corpus.remove(lambda doc: len(doc) == 0)
|
"""
|
||||||
|
|
||||||
for i in range(20):
|
|
||||||
printRandomDoc(de_corpus)
|
|
||||||
|
|
||||||
|
|
||||||
#save corpus
|
|
||||||
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
|
|
||||||
corpus_name = "de_corpus"
|
|
||||||
|
|
||||||
save_corpus(corpus=de_corpus,corpus_path=corpus_path,corpus_name=corpus_name)
|
|
||||||
|
|
||||||
|
|
||||||
#todo das selbe mit en_corpus
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
end = time.time()
|
|
||||||
printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
|
|
||||||
|
|
12
test.py

@@ -517,8 +517,8 @@ clean_in_content=[


-## add files to textacy-corpus,
-printlog("add texts to textacy-corpus")
+## add files to textacy-corpi,
+printlog("add texts to textacy-corpi")
ticketcorpus.add_texts(
    processTextstream(csv_to_contentStream(path2csv,"Description"), clean_in_content),
    processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)

@@ -558,7 +558,7 @@ def label2ID(label,labeldict=LABELDICT):

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
-        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
+        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text

@@ -596,7 +596,7 @@ n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen

-printlog("vectorize corpus...")
+printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)

@@ -620,8 +620,8 @@ printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

-#Transform the corpus and interpret our model:
-printlog("Transform the corpus and interpret our model..")
+#Transform the corpi and interpret our model:
+printlog("Transform the corpi and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()
28
testo.py

@@ -841,15 +841,15 @@ de_corpus = textacy.Corpus(DE_PARSER)


-## add files to textacy-corpus,
-printlog("add texts to textacy-corpus")
+## add files to textacy-corpi,
+printlog("add texts to textacy-corpi")
de_corpus.add_texts(
    processContentstream(csv_to_contentStream(path2csv,"Description"), token_filterlist=filter_tokens),
    processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
)


-# kick empty docs out of the corpus
+# kick empty docs out of the corpi
de_corpus.remove(lambda doc: len(doc)==0)

@@ -873,7 +873,7 @@ def printvecotorization(ngrams = 1,min_df = 1,max_df = 1.0,weighting ='tf',named


-#printlog("vectorize corpus...")
+#printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)

@@ -908,7 +908,7 @@ printvecotorization(ngrams=(1,2),min_df=1,max_df=0.8,weighting=weighting)


"""
-corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "de_corpus"
corpus_compression = 'gzip'
de_corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)

@@ -951,7 +951,7 @@ def topicModeling(ngrams,min_df,max_df,topicModel = 'lda',n_topics = len(LABELDI
####################'####################


-#printlog("vectorize corpus...")
+#printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)

@@ -971,8 +971,8 @@ def topicModeling(ngrams,min_df,max_df,topicModel = 'lda',n_topics = len(LABELDI
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

-#Transform the corpus and interpret our model:
-#printlog("Transform the corpus and interpret our model..")
+#Transform the corpi and interpret our model:
+#printlog("Transform the corpi and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()

@@ -1016,35 +1016,35 @@ topicModeling(ngrams = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
-              corpus=de_corpus)
+              corpi=de_corpus)

topicModeling(ngrams = 1,
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
-              corpus=de_corpus)
+              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
-              corpus=de_corpus)
+              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
-              corpus=de_corpus)
+              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.2,
              max_df = 0.8,
              topicModel = 'lda',
              n_topics = 20,
-              corpus=de_corpus)
+              corpi=de_corpus)

@@ -1124,7 +1124,7 @@ def label2ID(label,labeldict=LABELDICT):

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
-        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
+        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
120
testra.py

@@ -21,7 +21,7 @@ print(datetime.now())
PARSER=spacy.load("de")


-corpus = textacy.Corpus(PARSER)
+corpi = textacy.Corpus(PARSER)

testcontetn = [
    "fdsfdsfsd",

@@ -46,12 +46,12 @@ def makemeta( testmetda):
    yield metdata


-corpus.add_texts(
+corpi.add_texts(
    makecontent(testcontetn),
    makemeta(testmetda)
)

-print(corpus)
+print(corpi)
"""

@@ -66,12 +66,79 @@ def load_obj(path ):
    return pickle.load(f)


+def load_corpus(corpus_path, corpus_name, lang="de"):
+    from pathlib import Path
+
+    # load parser
+    parser = spacy.load(lang)
+
+    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
+    with open(stringstorepath) as file:
+        parser.vocab.strings.load(file)
+
+    vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
+    parser.vocab.load_lexemes(vocabpath)
+
+    corpus = textacy.Corpus(parser)
+
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    metapath = corpus_path + corpus_name + "_meta.json"
+
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    return corpus
+
+
+import os
+a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
+b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"
+d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt"
+
+c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"
+
+liste = [a,b,c,d]
+de_filepaths = [path for path in liste if os.path.basename(path).split("_")[0]=='de' and os.path.basename(path).split("_")[1]=='stopwords']
+
+from nltk.corpus import stopwords as nltk_stopwords
+from stop_words import get_stop_words
+import spacy
+from miscellaneous import *
+
+# from packages
+de_stop_words1 = list(get_stop_words("de"))
+de_stop_words2 = list(nltk_stopwords.words('german'))
+de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
+
+# from files
+de_stop_words_list = [list(textacy.fileio.read_file_lines(path)) for path in de_filepaths]
+de_stop_words4 = [item for sublist in de_stop_words_list for item in sublist]
+#print(de_stop_words4)
+
+de_stop_words = list(set(map(replaceRockDots(),list(map(textacy.preprocess.normalize_whitespace, de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
+print(len(de_stop_words))
+# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testra.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_testra.log &"


# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"


def build_thesaurus(path2lexicalentries):#, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

@@ -159,29 +226,6 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
"""

-def load_corpus(corpus_path, corpus_name, lang="de"):
-    contentpath = corpus_path + corpus_name + "_content.bin"
-    metapath = corpus_path + corpus_name + "_meta.json"
-
-    # load parser
-    parserpath = corpus_path + str(lang) + '_parser'
-    parser = spacy.load(parserpath)
-
-    corpus = textacy.Corpus(parser)
-
-    metadata_stream = textacy.fileio.read_json_lines(metapath)
-    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
-    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
-        corpus.add_doc(
-            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
-    return corpus
-
-
-# todo: load corpus from file; idea: load stringstore and vocab
-
-corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
-corpus_name = "de_raw_ticketCorpus"
-
-print(load_corpus(corpus_path, corpus_name))

"""
from postal.parser import parse_address

@@ -197,12 +241,12 @@ print(parse_address(address))
"""

-corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "testcorpus"


-#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
-#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
+#corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
+#corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)

@@ -225,12 +269,12 @@ def save_corpus(corpus_path,corpus_name):

    #save content
    contentpath = corpus_path + corpus_name+ "_content.bin"
-    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus),contentpath)
+    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi),contentpath)

    #save meta
    metapath = corpus_path + corpus_name +"_meta.json"
-    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
+    textacy.fileio.write_json_lines((doc.metadata for doc in corpi), metapath)

@@ -243,8 +287,8 @@ def load_corpus(corpus_path,corpus_name):
    with open(stringstore_path,"r") as file:
        nlp.vocab.strings.load(file)

-    # define corpus
-    corpus = textacy.Corpus(nlp)
+    # define corpi
+    corpi = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name +"_meta.json"

@@ -252,13 +296,13 @@ def load_corpus(corpus_path,corpus_name):

    #load content
    contentpath = corpus_path + corpus_name+ "_content.bin"
-    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+    spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
-        corpus.add_doc(
-            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+        corpi.add_doc(
+            textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))

-    return corpus
+    return corpi


save_corpus(corpus_path,corpus_name)
114
topicModeling.py

@@ -10,6 +10,46 @@ import time
import enchant

start = time.time()

+from datetime import datetime
+
+import time
+import logging
+from stop_words import get_stop_words
+
+#import words as words
+from nltk.corpus import stopwords as nltk_stopwords
+from collections import Counter
+import csv
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+csv.field_size_limit(sys.maxsize)
+import pickle
+import configparser as ConfigParser
+from miscellaneous import *
+
+
+import time
+
+
+from datetime import datetime
+import logging
+from nltk.corpus import stopwords
+import csv
+import functools
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+csv.field_size_limit(sys.maxsize)

import logging

@@ -34,56 +74,6 @@ from postal.parser import parse_address
csv.field_size_limit(sys.maxsize)

-def printlog(string, level="INFO"):
-    """log and prints"""
-    print(string)
-    if level == "INFO":
-        logging.info(string)
-    elif level == "DEBUG":
-        logging.debug(string)
-    elif level == "WARNING":
-        logging.warning(string)
-
-
-printlog("Load functions")
-
-
-def printRandomDoc(textacyCorpus):
-    import random
-    print()
-
-    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
-    randIndex = int((len(textacyCorpus) - 1) * random.random())
-    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
-                                                               textacyCorpus[randIndex].metadata))
-
-    print()
-
-
-def load_corpus(corpus_path,corpus_name):
-    # load new lang
-    nlp = spacy.load("de")
-
-    #load stringstore
-    stringstore_path = corpus_path + corpus_name + '_strings.json'
-    with open(stringstore_path,"r") as file:
-        nlp.vocab.strings.load(file)
-
-    # define corpus
-    corpus = textacy.Corpus(nlp)
-
-    # load meta
-    metapath = corpus_path + corpus_name +"_meta.json"
-    metadata_stream = textacy.fileio.read_json_lines(metapath)
-
-    #load content
-    contentpath = corpus_path + corpus_name+ "_content.bin"
-    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
-
-    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
-        corpus.add_doc(
-            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
-
-    return corpus


def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):

@@ -92,7 +82,7 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en
    printlog(str("max_df: {0}".format(max_df)))
    printlog(str("named_entities: {0}".format(named_entities)))

-    # printlog("vectorize corpus...")
+    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)

@@ -107,10 +97,10 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en


-corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "de_corpus"

-# load corpus
+# load corpi
de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path)

@@ -172,7 +162,7 @@ def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(
    ####################'####################

-    # printlog("vectorize corpus...")
+    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)

@@ -191,8 +181,8 @@ def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

-    # Transform the corpus and interpret our model:
-    # printlog("Transform the corpus and interpret our model..")
+    # Transform the corpi and interpret our model:
+    # printlog("Transform the corpi and interpret our model..")
    doc_topic_matrix = model.transform(doc_term_matrix)
    print()

@@ -228,35 +218,35 @@ topicModeling(ngrams = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
-              corpus=de_corpus)
+              corpi=de_corpus)

topicModeling(ngrams = 1,
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
-              corpus=de_corpus)
+              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
-              corpus=de_corpus)
+              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
-              corpus=de_corpus)
+              corpi=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.2,
              max_df = 0.8,
              topicModel = 'lda',
              n_topics = 20,
-              corpus=de_corpus)
+              corpi=de_corpus)

@@ -292,7 +282,7 @@ def label2ID(label, labeldict=LABELDICT):

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
-        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
+        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text