weiter aufgeräumt ("further tidied up")
This commit is contained in:
parent 56c8bce2d7
commit 17e45c30af
config.ini (51)
@@ -1,58 +1,53 @@
[thesaurus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl
input = deWordNet.xml
pickle_file = thesaurus_dict.pkl

[spellchecking]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl
input = deu_news_2015_1M-sentences.txt
pickle_file = words_dict.pkl

[lemmatization]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl
input = lemmas.txt
pickle_file = lemma_dict.pkl

[nouns]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl
input1 = nomen.txt
input2 = nomen2.txt
pickle_file = nouns_list.pkl

[firstnames]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl
input = firstnames.txt
pickle_file = firstnames_list.pkl

[de_stopwords]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl
input1 = de_stopwords_1.txt
input2 = de_stopwords_2.txt
input3 = de_stopwords_3.txt
pickle_file = stopwords_list.pkl

[logging]
level = INFO
filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
filename = topicModelTickets.log

[de_corpus]
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv

path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = de_raw_ticket
pre = de_pre_ticket
#input = M42-Export/Tickets_med.csv
#input = M42-Export/Tickets_small.csv
#input = M42-Export/Tickets_mini.csv
input = M42-Export/de_tickets.csv

path = corpi/

[en_corpus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv
input = M42-Export/en_tickets.csv

path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = en_raw_ticket
pre = en_pre_ticket
path = corpi/

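All values above change from absolute paths under /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ to paths relative to the repository. The scripts in the following diffs resolve those relative entries against their own location via a FILEPATH prefix. A minimal sketch of that pattern, not part of the commit; the config.read_file(f) call is an assumption, since every hunk cuts off right after the "with open(config_ini) as f:" line:

import os
import configparser as ConfigParser

# each script derives the repository directory from its own location once ...
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

config = ConfigParser.ConfigParser()
with open(FILEPATH + "config.ini") as f:
    config.read_file(f)  # assumed; the diffs end right after the "with open" line

# ... and prefixes every relative value read from config.ini
path2wordnet = FILEPATH + config.get("thesaurus", "input")    # -> <repo>/deWordNet.xml
corpus_de_path = FILEPATH + config.get("de_corpus", "path")   # -> <repo>/corpi/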
corporization.py (108)
@@ -1,53 +1,20 @@
# -*- coding: utf-8 -*-

from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *

import time

from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import os

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:

@@ -128,70 +95,61 @@ metaliste = [

content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = config.get("tickets","metaliste")
metaliste = config.get("tickets","metaliste").split(",")

path2de_csv = config.get("de_corpus","input")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")
path2de_csv = FILEPATH + config.get("de_corpus","input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")

path2en_csv = config.get("en_corpus","input")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")
path2en_csv = FILEPATH + config.get("en_corpus","input")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")

def main():
    start = time.time()
    printlog("Corporization: {0}".format(datetime.now()))

    #print paths
    path_csv_split = path2de_csv.split("/")
    printlog(path_csv_split[len(path_csv_split) - 1])
    path_csv_split = path2en_csv.split("/")
    printlog(path_csv_split[len(path_csv_split) - 1])
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):

    DE_PARSER = spacy.load("de")
    EN_PARSER = spacy.load("en")
    # print paths
    path_csv_split = path2_csv.split("/")
    filename = path_csv_split[len(path_csv_split) - 1]

    raw_de_corpus = textacy.Corpus(DE_PARSER)
    raw_en_corpus = textacy.Corpus(EN_PARSER)
    printlog("Corporization of {0} at {1}".format(filename,datetime.now()))

    raw_corpus = textacy.Corpus(lang)

    ## add files to textacy-corpi,
    printlog("Add texts to textacy-corpi")
    printlog("Add texts to {0}_textacy-corpi".format(lang))

    raw_de_corpus.add_texts(
        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2de_csv, metaliste)
    )

    raw_en_corpus.add_texts(
        ticketcsv_to_textStream(path2en_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2en_csv, metaliste)
    raw_corpus.add_texts(
        ticketcsv_to_textStream(path2_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2_csv, metaliste)
    )

    # leere docs aus corpi kicken
    raw_de_corpus.remove(lambda doc: len(doc) == 0)
    raw_en_corpus.remove(lambda doc: len(doc) == 0)
    raw_corpus.remove(lambda doc: len(doc) == 0)

    #random Doc printen
    for i in range(printrandom):
        printRandomDoc(raw_corpus)

    #for i in range(20):
    # printRandomDoc(raw_de_corpus)
    # printRandomDoc(raw_en_corpus)
    # save corpus
    raw_name = lang + "_raw_ticket"
    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)

    #save corpi
    save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
    save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)
def main():
    start = time.time()

    ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,metaliste,lang="de")

    ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en")

    end = time.time()
    printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))

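The change above folds the duplicated German/English corpus construction into one routine parameterized by lang. A condensed sketch of the new control flow, not part of the commit; the CSV streaming helpers are only used via "from miscellaneous import *" in the module, so their import location here is an assumption, and every call simply mirrors the lines shown in the diff:

import textacy
# project utilities as used in the diff; their exact location is assumed
from miscellaneous import (printlog, printRandomDoc, save_corpus,
                           ticketcsv_to_textStream, ticket_csv_to_DictStream)

def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
    filename = path2_csv.split("/")[-1]
    printlog("Corporization of {0}".format(filename))

    raw_corpus = textacy.Corpus(lang)  # one corpus per language instead of two hard-coded ones

    # stream ticket texts and their metadata rows out of the export CSV
    raw_corpus.add_texts(
        ticketcsv_to_textStream(path2_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2_csv, metaliste),
    )

    raw_corpus.remove(lambda doc: len(doc) == 0)  # drop empty documents

    for _ in range(printrandom):
        printRandomDoc(raw_corpus)

    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=lang + "_raw_ticket")

# main() then reduces to two calls:
# ticketcsv2Corpus(path2de_csv, corpus_de_path, content_collumn_name, metaliste, lang="de")
# ticketcsv2Corpus(path2en_csv, corpus_en_path, content_collumn_name, metaliste, lang="en")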
init.py (67)
@@ -1,29 +1,24 @@
# -*- coding: utf-8 -*-

from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
from stop_words import get_stop_words
import csv
import sys
import xml.etree.ElementTree as ET

from nltk.corpus import stopwords as nltk_stopwords

from collections import Counter
import time
from datetime import datetime
import os

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:

@@ -70,7 +65,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
    Result will be used as lemma_dict["word"] --> lemma

    :param path2lexicalentries: str
    :param path2wordnet: str
    :param returnall: bool if True, also return , word2synsets, synset2Words
    :return: dictionaries: thesaurus
    """

@@ -242,34 +237,34 @@ def build_words_for_spellchecking(path2words):
##################################################################################################

# THESAURUS
path2wordnet = config.get("thesaurus","input")
path2thesaurus_dict = config.get("thesaurus","pickle_file")
path2wordnet = FILEPATH + config.get("thesaurus","input")
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")

# SPELLCHECKING
path2words_file = config.get("spellchecking","input")
path2wordlist = config.get("spellchecking","pickle_file")
path2words_file = FILEPATH + config.get("spellchecking","input")
path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")

# LEMMA
path2lemma_file = config.get("lemmatization","input")
path2lemmadict = config.get("lemmatization","pickle_file")
path2lemma_file = FILEPATH + config.get("lemmatization","input")
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")

# NOMEN
nouns1 = config.get("nouns","input1")
nouns2 = config.get("nouns","input2")
path2nouns_list = config.get("nouns","pickle_file")
nouns1 = FILEPATH + config.get("nouns","input1")
nouns2 = FILEPATH + config.get("nouns","input2")
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")

# VORNAMEN
firstnames_txt = config.get("firstnames","input")
path2firstnameslist = config.get("firstnames","pickle_file")
firstnames_txt = FILEPATH + config.get("firstnames","input")
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")

# STOPWORDS
stop1 = config.get("de_stopwords","input1")
stop2 = config.get("de_stopwords","input2")
stop3 = config.get("de_stopwords","input3")
path2stopwordlist = config.get("de_stopwords","pickle_file")
stop1 = FILEPATH + config.get("de_stopwords","input1")
stop2 = FILEPATH + config.get("de_stopwords","input2")
stop3 = FILEPATH + config.get("de_stopwords","input3")
path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")

main.py (6)
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

import time
import init
import corporization
import preprocessing

@@ -8,7 +8,7 @@ from miscellaneous import *

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"

start = time.time()

init.main()
printlog("")

@@ -19,3 +19,5 @@ printlog("")
preprocessing.main()
printlog("")

end = time.time()
printlog("Total Time Elapsed: {0} min".format((end - start) / 60))

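main.py itself only swaps the absolute config path for the FILEPATH-based one, but it is also where the three stages are chained. A compact sketch of that orchestration, not part of the commit; the stage comments are inferred from the other diffs and the printlog import location is an assumption:

import time
import init
import corporization
import preprocessing
from miscellaneous import printlog  # assumed location of the project's logging helper

start = time.time()
init.main()            # build thesaurus / spellchecking / lemma / noun / firstname / stopword pickles
printlog("")
corporization.main()   # CSV exports -> raw textacy corpora (<lang>_raw_ticket)
printlog("")
preprocessing.main()   # raw corpora -> preprocessed corpora (<lang>_pre_ticket)
printlog("")
end = time.time()
printlog("Total Time Elapsed: {0} min".format((end - start) / 60))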
@@ -1,87 +1,25 @@
# -*- coding: utf-8 -*-
import random

import time

from pathlib import Path

from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys

from datetime import datetime

import time
start = time.time()

import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)

import time

import enchant

start = time.time()

import logging

import csv
import functools
import os.path
import re
import subprocess
import time
import xml.etree.ElementTree as ET
import sys
import spacy
import textacy
from scipy import *
from textacy import Vectorizer
import warnings
import configparser as ConfigParser
import sys
import hunspell
from postal.parser import parse_address

from datetime import datetime

import time
import logging
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import functools
import logging
import random
import re
import xml.etree.ElementTree as ET
import sys
from pathlib import Path
import pickle
import spacy
import textacy
from scipy import *
import sys
import os

csv.field_size_limit(sys.maxsize)
import pickle
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:

@@ -90,7 +28,7 @@ with open(config_ini) as f:

# config logging
filename = config.get("logging","filename")
filename = FILEPATH + config.get("logging","filename")
level = config.get("logging","level")
if level == "INFO":
    level = logging.INFO

@@ -188,6 +126,9 @@ def printRandomDoc(textacyCorpus):
    :param textacyCorpus:
    """
    print()
    if len(textacyCorpus) == 0:
        printlog("NO DOCS IN CORPUS")
    else:
        printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
        randIndex = int((len(textacyCorpus) - 1) * random.random())
        printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,

@@ -239,14 +180,14 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
    Load textacy-Corpus including spacy-parser out from file
    :param corpus_path: str
    :param corpus_name: str (should content the language like "_de_")
    :param lang: str language code)
    :param lang: str (language code) ir spacy.Language
    :return: texracy.Corpus, spacy.language
    """

    #ckeck for language
    if "_de_" in corpus_name:
    if "de_" in corpus_name:
        lang="de"
    elif "_en_" in corpus_name:
    elif "en_" in corpus_name:
        lang ="en"

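The load_corpus check above now matches a de_/en_ prefix (so names like de_raw_ticket are recognized) instead of requiring an embedded _de_/_en_ marker. A tiny sketch of just that check, not part of the commit; the standalone function name is hypothetical, in the repository the logic sits inside load_corpus:

def detect_corpus_lang(corpus_name, lang="de"):
    # mirror of the renamed check in load_corpus: prefix match instead of "_de_"/"_en_"
    if "de_" in corpus_name:
        lang = "de"
    elif "en_" in corpus_name:
        lang = "en"
    return lang

assert detect_corpus_lang("de_raw_ticket") == "de"
assert detect_corpus_lang("en_pre_ticket") == "en"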
preprocessing.py (163)
@@ -1,76 +1,23 @@
# -*- coding: utf-8 -*-

from datetime import datetime
print(datetime.now())
from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *

import time

from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)

import time

import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys

import os

csv.field_size_limit(sys.maxsize)

import pickle

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:

@@ -178,7 +125,6 @@ def getFirstSynonym(word, thesaurus=THESAURUS):
# http://wortschatz.uni-leipzig.de/en/download

import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

@@ -250,7 +196,7 @@ def stringcleaning(stringstream):
        string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))

        # cut_after
        word = "gruss"
        word = "gruss" #idee addressen enfernen --> postal.parser
        string = string.rpartition(word)[0] if word in string else string

        # lemmatize

@@ -347,29 +293,23 @@ def processDictstream(dictstream, funcdict, parser):
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"

path2thesaurus_dict = config.get("thesaurus","pickle_file")
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")

path2wordsdict = config.get("spellchecking", "pickle_file")
path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")

path2lemmadict = config.get("lemmatization","pickle_file")
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")

path2nouns_list = config.get("nouns","pickle_file")
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")

path2firstnameslist = config.get("firstnames","pickle_file")
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")

path2stopwordlist = config.get("de_stopwords","pickle_file")
path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")

corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")
pre_de_name = config.get("de_corpus", "pre")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")

corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")
pre_en_name = config.get("en_corpus", "pre")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")

custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",

@@ -383,10 +323,8 @@ custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanke
                "funktionieren", "kollege", "pruefen", "hoffen"
                ]

filter_tokens = [
    # removeENT(["PERSON"]),
    # idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser

    keepNouns(),

@@ -403,8 +341,8 @@ filter_tokens = [

]

#todo filtertokens haut alle raus
filter_tokens = None

clean_in_meta = {
    "Solution": [removePOS(["SPACE"])],

@@ -412,17 +350,43 @@ clean_in_meta = {
    "categoryName": [removePOS(["SPACE", "PUNCT"])]
}

def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):

    printlog("Preprocess {0}_corpus at {1}".format(lang,datetime.now()))

    rawCorpus_name = lang + "_raw_ticket"
    preCorpus_name = lang + "_pre_ticket"

    #load raw corpus and create new one
    raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path)

    corpus = textacy.Corpus(parser)

    ## process and add files to textacy-corpi,
    corpus.add_texts(
        processContentstream(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser),
        processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
    )

    # leere docs aus corpi kicken
    corpus.remove(lambda doc: len(doc) == 0)

    for i in range(printrandom):
        printRandomDoc(corpus)

    #save corpus
    save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)

def main():
    start = time.time()
    printlog("Preprocessing: {0}".format(datetime.now()))

    THESAURUS = load_obj(path2thesaurus_dict)

@@ -432,44 +396,9 @@ def main():
    NOUNS = load_obj(path2nouns_list)
    VORNAMEN = load_obj(path2firstnameslist)

    preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" )

    #load raw corpus and create new one
    raw_de_corpus, DE_PARSER = load_corpus(corpus_name=raw_de_name, corpus_path=corpus_de_path)
    raw_en_corpus, EN_PARSER = load_corpus(corpus_name=raw_en_name, corpus_path=corpus_en_path)

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)

    ## process and add files to textacy-corpi,
    printlog("Preprocess and add texts to textacy-corpi")
    de_corpus.add_texts(
        processContentstream(corpus2Text(raw_de_corpus), token_filterlist=filter_tokens, parser=DE_PARSER),
        processDictstream(corpus2Meta(raw_de_corpus), clean_in_meta,parser=raw_de_corpus.lang)
    )
    en_corpus.add_texts(
        processContentstream(corpus2Text(raw_en_corpus), token_filterlist=filter_tokens, parser=EN_PARSER),
        processDictstream(corpus2Meta(raw_en_corpus), clean_in_meta,parser=raw_en_corpus.lang)
    )

    # leere docs aus corpi kicken
    de_corpus.remove(lambda doc: len(doc) == 0)
    en_corpus.remove(lambda doc: len(doc) == 0)

    for i in range(20):
        printRandomDoc(de_corpus)
        #printRandomDoc(en_corpus)

    #save corpi
    save_corpus(corpus=de_corpus, corpus_path=corpus_de_path, corpus_name=pre_de_name)
    save_corpus(corpus=en_corpus, corpus_path=corpus_en_path, corpus_name=pre_en_name)

    preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )

    end = time.time()

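As in corporization.py, the per-language duplication above collapses into one routine keyed on lang: the corpus saved as <lang>_raw_ticket is loaded, filtered, and written back as <lang>_pre_ticket. A trimmed sketch of that round trip, not part of the commit; processContentstream and processDictstream are defined earlier in preprocessing.py (only their use is shown here), while corpus2Text, corpus2Meta and the corpus I/O helpers are assumed to come from miscellaneous:

import textacy
from miscellaneous import load_corpus, save_corpus, printlog, printRandomDoc  # assumed locations

def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
    printlog("Preprocess {0}_corpus".format(lang))

    # load the raw corpus written by corporization.py and start a fresh one with the same parser
    raw_corpus, parser = load_corpus(corpus_name=lang + "_raw_ticket", corpus_path=corpus_path)
    corpus = textacy.Corpus(parser)

    # run the token filters over the texts and the metadata cleaners over the metadata
    corpus.add_texts(
        processContentstream(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser),
        processDictstream(corpus2Meta(raw_corpus), clean_in_meta, parser=parser),
    )
    corpus.remove(lambda doc: len(doc) == 0)  # drop docs that end up empty after filtering

    for _ in range(printrandom):
        printRandomDoc(corpus)

    save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=lang + "_pre_ticket")

# main() then runs the same routine once per language:
# preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de")
# preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en")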
testra.py (52)
@@ -1,10 +1,11 @@
# -*- coding: utf-8 -*-

import re
import time
import json

import spacy
import textacy
#import spacy
#import textacy
from functools import reduce

start = time.time()

@@ -15,7 +16,6 @@ from datetime import datetime

import xml.etree.ElementTree as ET

print(datetime.now())

"""
PARSER=spacy.load("de")

@@ -55,19 +55,8 @@ print(corpi)
"""

import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

"""
def load_corpus(corpus_path, corpus_name, lang="de"):
    from pathlib import Path

@@ -95,7 +84,7 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

"""
import os
a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"

@@ -103,37 +92,16 @@ d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.

c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"

liste = [a,b,c,d]
de_filepaths = [path for path in liste if os.path.basename(path).split("_")[0]=='de' and os.path.basename(path).split("_")[1]=='stopwords']

from nltk.corpus import stopwords as nltk_stopwords

from stop_words import get_stop_words
import spacy
from miscellaneous import *
# from packages
de_stop_words1 = list(get_stop_words("de"))

de_stop_words2 = list(nltk_stopwords.words('german'))

de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)

# from files
de_stop_words_list = [list(textacy.fileio.read_file_lines(path)) for path in de_filepaths]
de_stop_words4 = [item for sublist in de_stop_words_list for item in sublist]
#print(de_stop_words4)

de_stop_words = list(set(map(replaceRockDots(),list(map(textacy.preprocess.normalize_whitespace, de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
print(len(de_stop_words))
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testra.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_testra.log &"

scriptpath = os.path.dirname(os.path.realpath(__file__))

"""
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"

@@ -212,7 +180,7 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
        pass
    return thesaurus
"""

for r in synroot:
    for element in r:

@@ -223,9 +191,8 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
        if id not in synset2Words.keys():
            synset2Words[id] = "WORD"
"""

"""

"""
from postal.parser import parse_address

@@ -557,6 +524,7 @@ de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")

"""

end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))

@@ -94,9 +94,6 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en
    printlog("doc_term_matrix: {0}".format(doc_term_matrix))
    printlog("id2term: {0}".format(id2term))

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "de_corpus"

@@ -104,11 +101,6 @@ corpus_name = "de_corpus"
de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path)

for i in range(5):
    printRandomDoc(de_corpus)

# todo gescheites tf(-idf) maß finden
ngrams = 1

@@ -128,6 +120,7 @@ printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
"""

# build citionary of ticketcategories
labelist = []

@@ -139,6 +132,7 @@ LABELDICT = {k: v for v, k in enumerate(labelist)}
printlog(str("LABELDICT: {0}".format(LABELDICT)))

def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False,
                         corpus=de_corpus):
    printlog(