247 lines
5.3 KiB
Python
247 lines
5.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
import re
|
|
import time
|
|
|
|
import textacy
|
|
|
|
# Wall-clock timestamp taken at script start; subtracted from `end` at the
# bottom of the script to report the total elapsed time.
start = time.time()
|
|
|
|
|
|
from datetime import datetime
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
# Log the wall-clock time at which this run started.
print(datetime.now())

# Absolute location of the deWordNet XML dump on the author's machine.
path2xml = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"

# Parse the XML eagerly with an explicit UTF-8 parser. `root` is the top-level
# element of the document; the active code visible in this file does not read
# it, only the disabled deWordNet extraction does.
xml_parser = ET.XMLParser(encoding="utf-8")
tree = ET.parse(path2xml, xml_parser)
root = tree.getroot()

# Character class matching one punctuation/special character.
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'

# Accumulator for the normalised nouns collected by the extraction steps.
nomen = []
|
|
|
|
|
|
|
|
|
|
### Extract nouns from the DeReWo word-frequency list
# http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html

# Transliterates German umlauts and sharp s to ASCII digraphs in a single pass.
_ROCK_DOTS = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})

raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")

for line in raw:
    line_list = line.split()

    # Skip blank or truncated records instead of failing with IndexError, and
    # keep only records whose third whitespace-separated field is "NN"
    # (presumably the part-of-speech tag for common nouns -- confirm against
    # the DeReWo file format).
    if len(line_list) < 3 or line_list[2] != "NN":
        continue

    # Lower-case first so that capitalised umlauts are transliterated as well.
    string = line_list[1].lower().translate(_ROCK_DOTS)

    nomen.append(string.strip())

# One noun per line; duplicates are not removed.
textacy.fileio.write_file_lines(nomen, "nomen2.txt")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Disabled code: alternative noun extraction from deWordNet.xml. It is wrapped
# in a bare string literal so that it never executes; remove the surrounding
# triple quotes to re-enable it.
"""
### extract nouns from deWordNet.xml
# https://github.com/hdaSprachtechnologie/odenet

# Compile the splitter once instead of once per lemma.
special_chars = re.compile(regex_specialChars)
rock_dots = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})

for r in root:
    for element in r:
        if element.tag != "LexicalEntry":
            continue

        for subentry in element:
            # .get() so a Lemma without a partOfSpeech attribute is skipped
            # instead of raising KeyError.
            if subentry.tag != "Lemma" or subentry.attrib.get("partOfSpeech") != "n":
                continue

            # Lower-case BEFORE transliterating so that capitalised umlauts
            # (e.g. a leading "Ä") are replaced as well.
            string = subentry.attrib["writtenForm"].lower().translate(rock_dots)

            # Split on special characters and keep single-word lemmas only.
            string = " ".join(special_chars.split(string))
            if len(string.split()) == 1:
                nomen.append(string.strip())

textacy.fileio.write_file_lines(nomen, "nomen.txt")
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Disabled code: splits the ticket CSV export into German / English / other
# files by detected language. It is wrapped in a bare string literal so that it
# never executes.
# NOTE(review): `detect` is not imported anywhere in the visible file --
# presumably `from langdetect import detect`; confirm before re-enabling.
"""
stream = textacy.fileio.read_csv(
    "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv",
    delimiter=";",
)
content_collumn_name = "Description"
content_collumn = 9  # standardvalue

de_tickets = []
en_tickets = []
misc_tickets = []
tickets_by_language = {"de": de_tickets, "en": en_tickets}

error_count = 0
for i, lst in enumerate(stream):
    if i == 0:
        # The header row is copied into every output file.
        de_tickets.append(lst)
        en_tickets.append(lst)
        misc_tickets.append(lst)
        continue

    try:
        # Detect exactly once per row so that both language checks are made
        # against the same result.
        language = detect(lst[content_collumn])
    except Exception:
        # Rows whose language cannot be determined are kept as "misc" and counted.
        misc_tickets.append(lst)
        error_count += 1
        continue

    tickets_by_language.get(language, misc_tickets).append(lst)

print(error_count)

textacy.fileio.write_csv(de_tickets, "M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets, "M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets, "M42-Export/misc_tickets.csv", delimiter=";")
"""
|
|
|
|
|
|
|
|
# Disabled code: ad-hoc demo that splits sample tokens on special characters.
# It is wrapped in a bare string literal so that it never executes. The literal
# is a RAW string so the regex backslashes inside it are not parsed as
# (invalid) string escape sequences.
r"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'


def stringcleaning(stringstream, funclist):
    # Lazily apply every function in funclist, in order, to each string.
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    # Compile once per factory call instead of once per cleaned string.
    pattern = re.compile(regex)
    return lambda string: " ".join(pattern.split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    # A separating comma was missing here, which silently concatenated this
    # item with the next one.
    "455a33c5,",
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------",
]

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning(words, [seperate_words_on_regex()]):
    print(s.strip())

    #print(stringcleaning(w,string_comp))
    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
    #result = specialFinder.sub(" ", w)
    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))

    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""
|
|
|
|
|
|
|
|
# Disabled code: normalises a German stop-word list (lower-casing, umlaut
# transliteration, whitespace normalisation) and writes it back to disk. It is
# wrapped in a bare string literal so that it never executes.
"""
def replaceRockDots():
    # Returns a callable that lower-cases a string and transliterates
    # umlauts / sharp s in a single pass.
    table = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})
    return lambda string: string.lower().translate(table)


de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))

#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))

#print(blob.entities)

de_stop_words = list(map(replaceRockDots(), de_stop_words))
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace, de_stop_words))
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))

#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words, "german_stopwords.txt")
"""
|
|
# Report the total wall-clock runtime in seconds, measured from `start`.
end = time.time()
elapsed = end - start
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(elapsed))
|
|
|