wrote custom corpus save/load methods

jannis.grundmann 2017-10-09 12:50:34 +02:00
parent 3bfbebc894
commit b542c4285a
3 changed files with 127 additions and 11 deletions


@@ -7,8 +7,8 @@ print(datetime.now())
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
path_csv_split = path2csv.split("/")
print(path_csv_split[len(path_csv_split)-1])
@@ -501,7 +501,7 @@ def build_thesaurus(path2lexicalentries, path2synsets):
THESAURUS=[]
#THESAURUS=build_thesaurus(path2lexicalentries=lexicalentries,path2synsets=synsets) #todo: enable
THESAURUS=build_thesaurus(path2lexicalentries=lexicalentries,path2synsets=synsets)
def getFirstSynonym(word, thesaurus=THESAURUS):
if not isinstance(word, str):
@@ -541,7 +541,7 @@ def words(text): return re.findall(r'\w+', text.lower())
WORDS={}
#WORDS = Counter(words(open(path2words).read())) #todo: enable
WORDS = Counter(words(open(path2words).read()))
def P(word, N=sum(WORDS.values())):
"Probability of `word`."
@@ -629,10 +629,10 @@ def stringcleaning(stringstream):
string = " ".join([lemmatizeWord(word) for word in string.split()])
# normalize synonyms  # idea: before or after lemmatizing?
#string = " ".join([getFirstSynonym(word) for word in string.split()])
string = " ".join([getFirstSynonym(word) for word in string.split()])
# autocorrect
#string = " ".join([autocorrectWord(word) for word in string.split()])
string = " ".join([autocorrectWord(word) for word in string.split()])
yield string
@@ -745,7 +745,7 @@ custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","vora
"auffahren","vorgang","hinweis","institut","universitaet","name","gruss","id","erfolg","mail","folge",
"nummer","team","fakultaet","email","absender","tu","versenden","vorname","message",
"service","strasse","prozess","portal","raum","personal","moeglichkeit","fremd","wende","rueckfrage", "stehen", "verfuegung"
"funktionieren","kollege", "pruefen"
"funktionieren","kollege", "pruefen","hoffen"
]
@@ -896,9 +896,15 @@ weighting = 'tf'
# weighting ='tfidf'
named_entities = False
#printvecotorization(ngrams=ngrams,min_df=min_df,max_df=max_df,weighting=weighting,named_entities=named_entities)
printvecotorization(ngrams=1,min_df=1,max_df=1.0,weighting=weighting)
printvecotorization(ngrams=1,min_df=1,max_df=0.5,weighting=weighting)
printvecotorization(ngrams=1,min_df=1,max_df=0.8,weighting=weighting)
printvecotorization(ngrams=(1,2),min_df=1,max_df=1.0,weighting=weighting)
printvecotorization(ngrams=(1,2),min_df=1,max_df=0.5,weighting=weighting)
printvecotorization(ngrams=(1,2),min_df=1,max_df=0.8,weighting=weighting)
"""

testra.py (116 lines changed)

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import re
import time
import json
import spacy
import textacy
@@ -16,7 +17,116 @@ import xml.etree.ElementTree as ET
print(datetime.now())
#PARSER=spacy.load("de")
PARSER=spacy.load("de")
corpus = textacy.Corpus(PARSER)
testcontetn = [
"fdsfdsfsd",
"juzdtjlkö",
"gfadojplk"
]
testmetda = [
{"categoryName":"zhb","Solution":"","Subject":"schulungstest"},
{"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},
{"categoryName":"zhb","Solution":"","Subject":"setuji"}
]
def makecontent(testcontetn):
for content in testcontetn:
yield content
def makemeta( testmetda):
for metdata in testmetda:
yield metdata
corpus.add_texts(
makecontent(testcontetn),
makemeta(testmetda)
)
print(corpus)
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "testcorpus"
"""
#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
import pathlib
strings_path = pathlib.Path(corpus_path + 'strings.json')
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
PARSER.vocab.dump(path_lexemes_bin_)
nlp.vocab.load_lexemes(path_lexemes_bin_)
"""
def save_corpus(corpus_path,corpus_name):
# save stringstore
stringstore_path = corpus_path + corpus_name + '_strings.json'
with open(stringstore_path, "w") as file:
PARSER.vocab.strings.dump(file)
#save content
contentpath = corpus_path + corpus_name+ "_content.bin"
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus),contentpath)
#save meta
metapath = corpus_path + corpus_name +"_meta.json"
textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
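For readability, a summary of the on-disk layout that save_corpus produces (derived from the paths built above, not part of the commit):

# On-disk layout for corpus_name = "testcorpus":
#   <corpus_path>/testcorpus_strings.json  -> spaCy StringStore dump
#   <corpus_path>/testcorpus_content.bin   -> serialized spaCy docs (write_spacy_docs)
#   <corpus_path>/testcorpus_meta.json     -> one metadata dict per line (write_json_lines)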
def load_corpus(corpus_path,corpus_name):
# load new lang
nlp = spacy.load("de")
#load stringstore
stringstore_path = corpus_path + corpus_name + '_strings.json'
with open(stringstore_path,"r") as file:
nlp.vocab.strings.load(file)
# define corpus
corpus = textacy.Corpus(nlp)
# load meta
metapath = corpus_path + corpus_name +"_meta.json"
metadata_stream = textacy.fileio.read_json_lines(metapath)
#load content
contentpath = corpus_path + corpus_name+ "_content.bin"
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
corpus.add_doc(
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
return corpus
save_corpus(corpus_path,corpus_name)
print(load_corpus(corpus_path,corpus_name))
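A minimal round-trip check, not part of the commit, using only attributes already exercised above (len(), iteration and .metadata):

reloaded = load_corpus(corpus_path, corpus_name)
assert len(reloaded) == len(corpus)                        # same number of docs
for original_doc, reloaded_doc in zip(corpus, reloaded):
    assert original_doc.metadata == reloaded_doc.metadata  # metadata survived the round trip
print("round trip ok")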
# idea: do the same for the spellchecker, lemmatizer and thesaurus, because of memory
# todo: reload the generators each run? because of the sequential picking at runtime, otherwise not deterministic
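The second todo refers to generator exhaustion: makecontent/makemeta can be consumed only once, so reusing them in a second pass silently yields nothing. A small illustration, using only the test data defined above:

gen = makecontent(testcontetn)
print(list(gen))  # first pass: ['fdsfdsfsd', 'juzdtjlkö', 'gfadojplk']
print(list(gen))  # second pass: [] -- exhausted; call makecontent(testcontetn) again for a fresh stream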
"""
@@ -97,7 +207,7 @@ for r in root:
"""
import re
from collections import Counter
@@ -135,7 +245,7 @@ def edits2(word):
"All edits that are two edits away from `word`."
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
"""