topicModelingTickets/testra.py

149 lines
3.7 KiB
Python

# -*- coding: utf-8 -*-
import time
start = time.time()
import corenlp as corenlp
import os
import re
import spacy
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser
#from polyglot.text import Text
import hunspell
from postal.parser import parse_address
import langdetect
import enchant
# TODO: split ticket.csv into German (de) and English (en) tickets
#print(parse_address(str(textacy.fileio.read_file("teststring.txt"))))
from langdetect import detect
# Split the exported tickets by the detected language of one text column:
# rows detected as German go to de_tickets, rows detected as English go to
# en_tickets, and everything else (other languages, rows that cannot be
# analysed) goes to misc_tickets. Each list is then written to its own CSV.
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
content_collumn_name = "Description"
# NOTE(review): content_collumn_name is never read; presumably index 9 is the
# position of that column in the export - confirm against the CSV header.
content_collumn = 9  # standardvalue

de_tickets = []
en_tickets = []
misc_tickets = []
error_count = 0  # number of rows for which language detection raised

# Maps a detected language code to the list that collects its rows; any code
# not listed here falls back to misc_tickets.
tickets_by_language = {"de": de_tickets, "en": en_tickets}

for i, lst in enumerate(stream):
    if i == 0:
        # Row 0 (presumably the CSV header) is copied into every output list.
        de_tickets.append(lst)
        en_tickets.append(lst)
        misc_tickets.append(lst)
        continue

    try:
        content_collumn_ = lst[content_collumn]
        # Detect exactly once per row. langdetect is non-deterministic unless
        # seeded, so asking twice could give two different answers and push a
        # German/English row into misc_tickets by accident.
        language = detect(content_collumn_)
    except Exception:
        # Best effort: a row that is too short or whose text cannot be
        # analysed is kept in misc_tickets and counted. Unlike a bare
        # ``except:`` this still lets Ctrl+C / SystemExit stop the script.
        misc_tickets.append(lst)
        error_count += 1
    else:
        tickets_by_language.get(language, misc_tickets).append(lst)

print(error_count)

textacy.fileio.write_csv(de_tickets, "M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets, "M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets, "M42-Export/misc_tickets.csv", delimiter=";")
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
def stringcleaning(stringstream, funclist):
for string in stringstream:
for f in funclist:
string = f(string)
yield string
def seperate_words_on_regex(regex=regex_specialChars):
return lambda string: " ".join(re.compile(regex).split(string))
words = [
"uniaccount",
"nr54065467",
"nr54065467",
"455a33c5,"
"tvt?=",
"tanja.saborowski@tu-dortmund.de",
"-",
"m-sw1-vl4053.itmc.tu-dortmund.de",
"------problem--------"
]
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
print(s.strip())
#print(stringcleaning(w,string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""
"""
def replaceRockDots():
return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))
#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
#print(blob.entities)
de_stop_words = list(map(replaceRockDots(),de_stop_words))
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
"""
# Report the wall-clock seconds elapsed since `start` was recorded.
end = time.time()
elapsed = end - start
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(elapsed))