weiter aufgeräumt ("further tidied up")
This commit is contained in:
parent 56c8bce2d7
commit 17e45c30af
config.ini (51 lines changed)

@@ -1,58 +1,53 @@
 [thesaurus]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
+input = deWordNet.xml
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl
+pickle_file = thesaurus_dict.pkl

 [spellchecking]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
+input = deu_news_2015_1M-sentences.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl
+pickle_file = words_dict.pkl

 [lemmatization]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
+input = lemmas.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl
+pickle_file = lemma_dict.pkl

 [nouns]
-input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
+input1 = nomen.txt
-input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
+input2 = nomen2.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl
+pickle_file = nouns_list.pkl

 [firstnames]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
+input = firstnames.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl
+pickle_file = firstnames_list.pkl

 [de_stopwords]
-input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
+input1 = de_stopwords_1.txt
-input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
+input2 = de_stopwords_2.txt
-input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
+input3 = de_stopwords_3.txt
-pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl
+pickle_file = stopwords_list.pkl

 [logging]
 level = INFO
-filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
+filename = topicModelTickets.log

 [de_corpus]
-#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
+#input = M42-Export/Tickets_med.csv
-#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
+#input = M42-Export/Tickets_small.csv
-#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
+#input = M42-Export/Tickets_mini.csv
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv
+input = M42-Export/de_tickets.csv

-path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
-raw = de_raw_ticket
-pre = de_pre_ticket
+path = corpi/

 [en_corpus]
-input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv
+input = M42-Export/en_tickets.csv

-path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
+path = corpi/
-raw = en_raw_ticket
-pre = en_pre_ticket
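All paths in config.ini are now relative; every script in this commit prepends the directory it lives in. A minimal sketch of that pattern, using the [de_corpus] entries above (the read_file call is an assumption, the diff cuts off before the actual parsing call):

import os
import configparser as ConfigParser

# resolve everything relative to the directory of the running script
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

config = ConfigParser.ConfigParser()
with open(FILEPATH + "config.ini") as f:
    config.read_file(f)  # assumption: the actual call is not visible in this diff

# "M42-Export/de_tickets.csv" becomes an absolute path again
path2de_csv = FILEPATH + config.get("de_corpus", "input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")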
corporization.py (110 lines changed)

@@ -1,53 +1,20 @@
 # -*- coding: utf-8 -*-

-from datetime import datetime
-
-import time
-import logging
-from stop_words import get_stop_words
-
-#import words as words
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
 import csv
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
 import sys
-csv.field_size_limit(sys.maxsize)
-import pickle
-import configparser as ConfigParser
 from miscellaneous import *

 import time

 from datetime import datetime
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
 import re
-import xml.etree.ElementTree as ET
-import spacy
 import textacy
 from scipy import *
-import sys
+import os

-csv.field_size_limit(sys.maxsize)
+csv.field_size_limit(sys.maxsize)
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"

 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:

@@ -128,70 +95,61 @@ metaliste = [

 content_collumn_name = config.get("tickets","content_collumn_name")
-metaliste = config.get("tickets","metaliste")
+metaliste = config.get("tickets","metaliste").split(",")

-path2de_csv = config.get("de_corpus","input")
+path2de_csv = FILEPATH + config.get("de_corpus","input")
-corpus_de_path = config.get("de_corpus", "path")
+corpus_de_path = FILEPATH + config.get("de_corpus", "path")
-raw_de_name = config.get("de_corpus", "raw")

-path2en_csv = config.get("en_corpus","input")
+path2en_csv = FILEPATH + config.get("en_corpus","input")
-corpus_en_path = config.get("en_corpus", "path")
+corpus_en_path = FILEPATH + config.get("en_corpus", "path")
-raw_en_name = config.get("en_corpus", "raw")

-def main():
-    start = time.time()
-    printlog("Corporization: {0}".format(datetime.now()))
+def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):

     # print paths
-    path_csv_split = path2de_csv.split("/")
-    printlog(path_csv_split[len(path_csv_split) - 1])
-    path_csv_split = path2en_csv.split("/")
-    printlog(path_csv_split[len(path_csv_split) - 1])
+    path_csv_split = path2_csv.split("/")
+    filename = path_csv_split[len(path_csv_split) - 1]
+    printlog("Corporization of {0} at {1}".format(filename,datetime.now()))

-    DE_PARSER = spacy.load("de")
-    EN_PARSER = spacy.load("en")
-
-    raw_de_corpus = textacy.Corpus(DE_PARSER)
-    raw_en_corpus = textacy.Corpus(EN_PARSER)
+    raw_corpus = textacy.Corpus(lang)

     ## add files to textacy-corpi,
-    printlog("Add texts to textacy-corpi")
+    printlog("Add texts to {0}_textacy-corpi".format(lang))

-    raw_de_corpus.add_texts(
-        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
-        ticket_csv_to_DictStream(path2de_csv, metaliste)
-    )
-
-    raw_en_corpus.add_texts(
-        ticketcsv_to_textStream(path2en_csv, content_collumn_name),
-        ticket_csv_to_DictStream(path2en_csv, metaliste)
+    raw_corpus.add_texts(
+        ticketcsv_to_textStream(path2_csv, content_collumn_name),
+        ticket_csv_to_DictStream(path2_csv, metaliste)
     )

     # leere docs aus corpi kicken
-    raw_de_corpus.remove(lambda doc: len(doc) == 0)
-    raw_en_corpus.remove(lambda doc: len(doc) == 0)
+    raw_corpus.remove(lambda doc: len(doc) == 0)

-    #for i in range(20):
-    # printRandomDoc(raw_de_corpus)
-    # printRandomDoc(raw_en_corpus)
+    #random Doc printen
+    for i in range(printrandom):
+        printRandomDoc(raw_corpus)

-    #save corpi
-    save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
-    save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)
+    # save corpus
+    raw_name = lang + "_raw_ticket"
+    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)


+def main():
+    start = time.time()
+
+    ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,metaliste,lang="de")
+    ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en")

     end = time.time()
     printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))
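main() now simply drives the generalized helper once per language. A hypothetical stand-alone call, mirroring what main() does above (the column name and metadata fields are placeholders; the real values come from the [tickets] section of config.ini):

from corporization import ticketcsv2Corpus

ticketcsv2Corpus(
    path2_csv="M42-Export/de_tickets.csv",   # input from config.ini [de_corpus]
    corpus_path="corpi/",                    # path from config.ini [de_corpus]
    content_collumn_name="Description",      # placeholder; actually config.get("tickets", "content_collumn_name")
    metaliste=["TicketNumber", "Subject"],   # placeholder; actually config.get("tickets", "metaliste").split(",")
    lang="de",
    printrandom=2,                           # print two random docs as a sanity check
)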
init.py (67 lines changed)

@@ -1,29 +1,24 @@
 # -*- coding: utf-8 -*-

-from datetime import datetime
-
-import time
-import logging
-from stop_words import get_stop_words
-
-#import words as words
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
-import csv
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-csv.field_size_limit(sys.maxsize)
-import pickle
-import configparser as ConfigParser
 from miscellaneous import *
+from stop_words import get_stop_words
+import csv
+import sys
+import xml.etree.ElementTree as ET
+
+from nltk.corpus import stopwords as nltk_stopwords
+
+from collections import Counter
+import time
+from datetime import datetime
+import os
+
+csv.field_size_limit(sys.maxsize)
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"

 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:

@@ -70,7 +65,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):

     Result will be used as lemma_dict["word"] --> lemma

-    :param path2lexicalentries: str
+    :param path2wordnet: str
     :param returnall: bool if True, also return , word2synsets, synset2Words
     :return: dictionaries: thesaurus
     """

@@ -242,34 +237,34 @@ def build_words_for_spellchecking(path2words):
 ##################################################################################################

 # THESAURUS
-path2wordnet = config.get("thesaurus","input")
+path2wordnet = FILEPATH + config.get("thesaurus","input")
-path2thesaurus_dict = config.get("thesaurus","pickle_file")
+path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")

 # SPELLCHECKING
-path2words_file = config.get("spellchecking","input")
+path2words_file = FILEPATH + config.get("spellchecking","input")
-path2wordlist = config.get("spellchecking","pickle_file")
+path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")

 # LEMMA
-path2lemma_file = config.get("lemmatization","input")
+path2lemma_file = FILEPATH + config.get("lemmatization","input")
-path2lemmadict = config.get("lemmatization","pickle_file")
+path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")

 # NOMEN
-nouns1 = config.get("nouns","input1")
+nouns1 = FILEPATH + config.get("nouns","input1")
-nouns2 = config.get("nouns","input2")
+nouns2 = FILEPATH + config.get("nouns","input2")
-path2nouns_list = config.get("nouns","pickle_file")
+path2nouns_list = FILEPATH + config.get("nouns","pickle_file")

 # VORNAMEN
-firstnames_txt = config.get("firstnames","input")
+firstnames_txt = FILEPATH + config.get("firstnames","input")
-path2firstnameslist = config.get("firstnames","pickle_file")
+path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")

 # STOPWORDS
-stop1 = config.get("de_stopwords","input1")
+stop1 = FILEPATH + config.get("de_stopwords","input1")
-stop2 = config.get("de_stopwords","input2")
+stop2 = FILEPATH + config.get("de_stopwords","input2")
-stop3 = config.get("de_stopwords","input3")
+stop3 = FILEPATH + config.get("de_stopwords","input3")
-path2stopwordlist = config.get("de_stopwords","pickle_file")
+path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")
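init.py builds these dictionaries and lists and writes them to the pickle_file targets; preprocessing.py later reads them back with load_obj. The helpers look roughly like the save_obj/load_obj pair that this commit deletes from testra.py further down (they are assumed to live in miscellaneous.py, which is not part of this diff):

import pickle

def save_obj(obj, path):
    # serialize any picklable object (dict, list, ...) next to the given path
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    # read a pickled object back
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)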
main.py (6 lines changed)

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

+import time
 import init
 import corporization
 import preprocessing

@@ -8,7 +8,7 @@ from miscellaneous import *

 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"

+start = time.time()

 init.main()
 printlog("")

@@ -19,3 +19,5 @@ printlog("")
 preprocessing.main()
 printlog("")

+end = time.time()
+printlog("Total Time Elapsed: {0} min".format((end - start) / 60))
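Pieced together from the three hunks above, main.py after this commit is essentially the following driver (a reconstruction; lines outside the shown hunks, such as a possible topic-modeling step, are not visible in this diff):

# -*- coding: utf-8 -*-
import time

import init
import corporization
import preprocessing
from miscellaneous import *

start = time.time()

init.main()           # build and pickle thesaurus, spellcheck words, lemmas, nouns, first names, stopwords
printlog("")

corporization.main()  # CSV tickets -> raw textacy corpora (de/en); call order assumed
printlog("")

preprocessing.main()  # raw corpora -> cleaned "pre" corpora
printlog("")

end = time.time()
printlog("Total Time Elapsed: {0} min".format((end - start) / 60))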
@@ -1,87 +1,25 @@
 # -*- coding: utf-8 -*-
-import random
-
-import time
-
-from pathlib import Path
-
-from datetime import datetime
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-
-from datetime import datetime
-
-import time
-start = time.time()
-
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-csv.field_size_limit(sys.maxsize)
-
-import time
-
-import enchant
-
-start = time.time()
-
-import logging
-
-import csv
-import functools
-import os.path
-import re
-import subprocess
-import time
-import xml.etree.ElementTree as ET
-import sys
-import spacy
-import textacy
-from scipy import *
-from textacy import Vectorizer
-import warnings
 import configparser as ConfigParser
-import sys
-import hunspell
-from postal.parser import parse_address
-
-from datetime import datetime
-
-import time
-import logging
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
 import csv
+import functools
+import logging
+import random
 import re
-import xml.etree.ElementTree as ET
+import sys
+from pathlib import Path
+import pickle
 import spacy
 import textacy
 from scipy import *
-import sys
+import os

 csv.field_size_limit(sys.maxsize)
-import pickle
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"

 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:

@@ -90,7 +28,7 @@ with open(config_ini) as f:

 # config logging
-filename = config.get("logging","filename")
+filename = FILEPATH + config.get("logging","filename")
 level = config.get("logging","level")
 if level == "INFO":
     level = logging.INFO

@@ -188,6 +126,9 @@ def printRandomDoc(textacyCorpus):
     :param textacyCorpus:
     """
     print()
+    if len(textacyCorpus) == 0:
+        printlog("NO DOCS IN CORPUS")
+    else:
     printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
     randIndex = int((len(textacyCorpus) - 1) * random.random())
     printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,

@@ -239,14 +180,14 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
     Load textacy-Corpus including spacy-parser out from file
     :param corpus_path: str
     :param corpus_name: str (should content the language like "_de_")
-    :param lang: str language code)
+    :param lang: str (language code) ir spacy.Language
     :return: texracy.Corpus, spacy.language
     """

     #ckeck for language
-    if "_de_" in corpus_name:
+    if "de_" in corpus_name:
         lang="de"
-    elif "_en_" in corpus_name:
+    elif "en_" in corpus_name:
         lang ="en"
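The language check in load_corpus now keys on a plain "de_"/"en_" prefix instead of "_de_"/"_en_", which matches the new corpus names such as de_raw_ticket produced by corporization.py. A small illustration of that convention (the helper name is invented for this sketch):

def corpus_lang_from_name(corpus_name, default="de"):
    # "de_raw_ticket" -> "de", "en_pre_ticket" -> "en"
    if "de_" in corpus_name:
        return "de"
    elif "en_" in corpus_name:
        return "en"
    return default

assert corpus_lang_from_name("de_raw_ticket") == "de"
assert corpus_lang_from_name("en_pre_ticket") == "en"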
preprocessing.py (163 lines changed)

@@ -1,76 +1,23 @@
 # -*- coding: utf-8 -*-

 from datetime import datetime
-print(datetime.now())
-from datetime import datetime
-
-import time
-import logging
-from stop_words import get_stop_words
-
-#import words as words
-from nltk.corpus import stopwords as nltk_stopwords
-from collections import Counter
 import csv
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
 import sys
-csv.field_size_limit(sys.maxsize)
-import pickle
-import configparser as ConfigParser
 from miscellaneous import *

-import time
-
 from datetime import datetime
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
-import textacy
-from scipy import *
-import sys
-csv.field_size_limit(sys.maxsize)

 import time
-import logging
-from nltk.corpus import stopwords
-import csv
-import functools
-import re
-import xml.etree.ElementTree as ET
-import spacy
 import textacy
 from scipy import *
-import sys
+import os

 csv.field_size_limit(sys.maxsize)
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

-import pickle

 # load config
-config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+config_ini = FILEPATH + "config.ini"

 config = ConfigParser.ConfigParser()
 with open(config_ini) as f:

@@ -178,7 +125,6 @@ def getFirstSynonym(word, thesaurus=THESAURUS):
 # http://wortschatz.uni-leipzig.de/en/download

 import re
-from collections import Counter

 def words(text): return re.findall(r'\w+', text.lower())

@@ -250,7 +196,7 @@ def stringcleaning(stringstream):
     string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))

     # cut_after
-    word = "gruss"
+    word = "gruss" #idee addressen enfernen --> postal.parser
     string = string.rpartition(word)[0] if word in string else string

     # lemmatize

@@ -347,29 +293,23 @@ def processDictstream(dictstream, funcdict, parser):
 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"

-path2thesaurus_dict = config.get("thesaurus","pickle_file")
+path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")

-path2wordsdict = config.get("spellchecking", "pickle_file")
+path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")

-path2lemmadict = config.get("lemmatization","pickle_file")
+path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")

-path2nouns_list = config.get("nouns","pickle_file")
+path2nouns_list = FILEPATH + config.get("nouns","pickle_file")

-path2firstnameslist = config.get("firstnames","pickle_file")
+path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")

-path2stopwordlist = config.get("de_stopwords","pickle_file")
+path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")

-corpus_de_path = config.get("de_corpus", "path")
+corpus_de_path = FILEPATH + config.get("de_corpus", "path")
-raw_de_name = config.get("de_corpus", "raw")
-pre_de_name = config.get("de_corpus", "pre")

+corpus_en_path = FILEPATH + config.get("en_corpus", "path")
-corpus_en_path = config.get("en_corpus", "path")
-raw_en_name = config.get("en_corpus", "raw")
-pre_en_name = config.get("en_corpus", "pre")

 custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",

@@ -383,10 +323,8 @@ custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanke
                 "funktionieren", "kollege", "pruefen", "hoffen"
                 ]

 filter_tokens = [
     # removeENT(["PERSON"]),
-    # idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser

     keepNouns(),

@@ -403,8 +341,8 @@ filter_tokens = [

 ]
+#todo filtertokens haut alle raus
+filter_tokens = None

 clean_in_meta = {
     "Solution": [removePOS(["SPACE"])],

@@ -412,17 +350,43 @@ clean_in_meta = {
     "categoryName": [removePOS(["SPACE", "PUNCT"])]
 }

+def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
+
+    printlog("Preprocess {0}_corpus at {1}".format(lang,datetime.now()))
+
+    rawCorpus_name = lang + "_raw_ticket"
+    preCorpus_name = lang + "_pre_ticket"
+
+    #load raw corpus and create new one
+    raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path)
+
+    corpus = textacy.Corpus(parser)
+
+    ## process and add files to textacy-corpi,
+    corpus.add_texts(
+        processContentstream(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser),
+        processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
+    )
+
+    # leere docs aus corpi kicken
+    corpus.remove(lambda doc: len(doc) == 0)
+
+    for i in range(printrandom):
+        printRandomDoc(corpus)
+
+    #save corpus
+    save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)


 def main():
     start = time.time()
-    printlog("Preprocessing: {0}".format(datetime.now()))

     THESAURUS = load_obj(path2thesaurus_dict)

@@ -432,44 +396,9 @@ def main():
     NOUNS = load_obj(path2nouns_list)
     VORNAMEN = load_obj(path2firstnameslist)

+    preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" )
+    preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )

-    #load raw corpus and create new one
-    raw_de_corpus, DE_PARSER = load_corpus(corpus_name=raw_de_name, corpus_path=corpus_de_path)
-    raw_en_corpus, EN_PARSER = load_corpus(corpus_name=raw_en_name, corpus_path=corpus_en_path)
-
-    de_corpus = textacy.Corpus(DE_PARSER)
-    en_corpus = textacy.Corpus(EN_PARSER)
-
-    ## process and add files to textacy-corpi,
-    printlog("Preprocess and add texts to textacy-corpi")
-    de_corpus.add_texts(
-        processContentstream(corpus2Text(raw_de_corpus), token_filterlist=filter_tokens, parser=DE_PARSER),
-        processDictstream(corpus2Meta(raw_de_corpus), clean_in_meta,parser=raw_de_corpus.lang)
-    )
-    en_corpus.add_texts(
-        processContentstream(corpus2Text(raw_en_corpus), token_filterlist=filter_tokens, parser=EN_PARSER),
-        processDictstream(corpus2Meta(raw_en_corpus), clean_in_meta,parser=raw_en_corpus.lang)
-    )
-
-    # leere docs aus corpi kicken
-    de_corpus.remove(lambda doc: len(doc) == 0)
-    en_corpus.remove(lambda doc: len(doc) == 0)
-
-    for i in range(20):
-        printRandomDoc(de_corpus)
-        #printRandomDoc(en_corpus)
-
-    #save corpi
-    save_corpus(corpus=de_corpus, corpus_path=corpus_de_path, corpus_name=pre_de_name)
-    save_corpus(corpus=en_corpus, corpus_path=corpus_en_path, corpus_name=pre_en_name)

     end = time.time()
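Because corporization.py now saves "{lang}_raw_ticket" and preprocessCorpus() rebuilds exactly that name from its lang argument, the raw/pre keys could be dropped from config.ini; the only shared contract between the two stages is the corpus path plus this naming scheme. Spelled out as a tiny sketch:

def raw_corpus_name(lang):
    # written by corporization.ticketcsv2Corpus(), read by preprocessing.preprocessCorpus()
    return lang + "_raw_ticket"

def pre_corpus_name(lang):
    # written by preprocessing.preprocessCorpus()
    return lang + "_pre_ticket"

assert raw_corpus_name("de") == "de_raw_ticket"
assert pre_corpus_name("en") == "en_pre_ticket"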
testra.py (52 lines changed)

@@ -1,10 +1,11 @@
 # -*- coding: utf-8 -*-

 import re
 import time
 import json

-import spacy
-import textacy
+#import spacy
+#import textacy
 from functools import reduce

 start = time.time()

@@ -15,7 +16,6 @@ from datetime import datetime
 import xml.etree.ElementTree as ET

-print(datetime.now())

 """
 PARSER=spacy.load("de")

@@ -55,19 +55,8 @@ print(corpi)
 """

-import pickle
-
-def save_obj(obj, path):
-    with open(path + '.pkl', 'wb') as f:
-        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
-
-def load_obj(path ):
-    with open(path + '.pkl', 'rb') as f:
-        return pickle.load(f)
-
+"""
 def load_corpus(corpus_path, corpus_name, lang="de"):
     from pathlib import Path

@@ -95,7 +84,7 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
         corpus.add_doc(
             textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
     return corpus
+"""
 import os
 a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
 b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"

@@ -103,37 +92,16 @@ d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.

 c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"

-liste = [a,b,c,d]
-de_filepaths = [path for path in liste if os.path.basename(path).split("_")[0]=='de' and os.path.basename(path).split("_")[1]=='stopwords']
-
-from nltk.corpus import stopwords as nltk_stopwords
-from stop_words import get_stop_words
-import spacy
-from miscellaneous import *
-# from packages
-de_stop_words1 = list(get_stop_words("de"))
-de_stop_words2 = list(nltk_stopwords.words('german'))
-de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
-
-# from files
-de_stop_words_list = [list(textacy.fileio.read_file_lines(path)) for path in de_filepaths]
-de_stop_words4 = [item for sublist in de_stop_words_list for item in sublist]
-#print(de_stop_words4)
-
-de_stop_words = list(set(map(replaceRockDots(),list(map(textacy.preprocess.normalize_whitespace, de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
-print(len(de_stop_words))
-# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testra.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_testra.log &"

+scriptpath = os.path.dirname(os.path.realpath(__file__))
+"""
 # THESAURUS
 lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
 lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"

@@ -212,7 +180,7 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
         pass
     return thesaurus

-"""
 for r in synroot:
     for element in r:

@@ -223,10 +191,9 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
     if id not in synset2Words.keys():
         synset2Words[id] = "WORD"

 """

 """
 from postal.parser import parse_address

@@ -557,6 +524,7 @@ de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
 textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")

 """

 end = time.time()
 print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))
@@ -94,9 +94,6 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en
     printlog("doc_term_matrix: {0}".format(doc_term_matrix))
     printlog("id2term: {0}".format(id2term))

 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
 corpus_name = "de_corpus"

@@ -104,11 +101,6 @@ corpus_name = "de_corpus"
 de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path)

-for i in range(5):
-    printRandomDoc(de_corpus)

 # todo gescheites tf(-idf) maß finden
 ngrams = 1

@@ -128,6 +120,7 @@ printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
 printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
 """

 # build citionary of ticketcategories
 labelist = []

@@ -139,6 +132,7 @@ LABELDICT = {k: v for v, k in enumerate(labelist)}
 printlog(str("LABELDICT: {0}".format(LABELDICT)))

 def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False,
                          corpus=de_corpus):
     printlog(