148 lines
4.0 KiB
Python
148 lines
4.0 KiB
Python
|
import csv
|
||
|
import sys
|
||
|
import os
|
||
|
import pymongo
|
||
|
|
||
|
# Definitions
# Word-type identifiers; each vocabulary entry carries one of them in its
# "type" field and the CSV parser picks its column layout based on it.
TYPE_NOUNS, TYPE_VERBS, TYPE_ADJECTIVES = 0, 1, 2

# Directory containing the vocabulary CSV files (a relative path, so it is
# resolved against the current working directory when files are opened).
PATH_TO_VOCAB = "../data/vocab"
|
||
|
|
||
|
def preprocess_row(row):
    """Return a copy of *row* without its first column."""
    without_first_column = row[slice(1, None)]
    return without_first_column
|
||
|
|
||
|
def genus_to_datatype(gen):
    """Map a one-letter genus code to the name of the grammatical gender.

    "m" becomes "Maskulin", "w" becomes "Feminin" and "n" becomes
    "Neutrum". Any other value yields None.
    """
    genus_names = (
        ("m", "Maskulin"),
        ("w", "Feminin"),
        ("n", "Neutrum"),
    )
    for code, name in genus_names:
        if gen == code:
            return name
    return None
|
||
|
|
||
|
def log(msg, err=False, tabs=0):
    """Print *msg* to stdout with a status marker.

    The line starts with "[X] " when *err* is truthy and with "[*] "
    otherwise, followed by *tabs* tab characters and the message itself.
    """
    marker = "[X] " if err else "[*] "
    indent = "\t" * tabs
    print(marker + indent + msg)
|
||
|
|
||
|
def dbg(msg, tabs=0):
    """Print *msg* to stdout as a debug line.

    The line consists of the "[D] " marker, *tabs* tab characters and
    then the message.
    """
    indent = "\t" * tabs
    print("[D] " + indent + msg)
|
||
|
|
||
|
def csv_to_vocab(filename, type, from_id, num_lines_to_skip=3):
    """Parse one vocabulary CSV file into a list of database documents.

    Args:
        filename: Name of the CSV file inside PATH_TO_VOCAB.
        type: One of TYPE_NOUNS, TYPE_VERBS or TYPE_ADJECTIVES. It decides
            how the columns of each row are interpreted. (The name shadows
            the builtin but is kept so existing callers keep working.)
        from_id: The last ID that is already taken; the first entry built
            here receives ``from_id + 1``.
        num_lines_to_skip: Number of leading CSV rows (headers) to ignore.

    Returns:
        A tuple ``(vocab, last_id)``. ``vocab`` is the list of entries and
        ``last_id`` is the last ID that was actually assigned (``from_id``
        when no entry was produced), so it can be passed straight back in
        as ``from_id`` for the next file without leaving a gap.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a data row lacks one of the mandatory columns.
    """
    next_id = from_id + 1
    vocab = []
    path = os.path.join(PATH_TO_VOCAB, filename)
    dbg("Reading from {0} ({1})".format(filename, path), tabs=2)

    # Explicit encoding so the result does not depend on the platform's
    # locale. NOTE(review): assumes the vocabulary files are UTF-8 - confirm.
    with open(path, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar="\"")
        for row_number, raw in enumerate(reader, start=1):
            # Skip the header lines
            if row_number <= num_lines_to_skip:
                continue

            # The nouns are special: their first column is dropped
            row = preprocess_row(raw) if type == TYPE_NOUNS else raw

            # Blank lines carry no vocabulary; skip them instead of crashing
            if not row:
                continue

            latin = {
                "grundform": row[0]
            }

            # The parsing depends on the type of word we're dealing with
            known_type = True
            if type == TYPE_NOUNS:
                latin["genitiv"] = row[1]
                latin["genus"] = genus_to_datatype(row[2])
            elif type == TYPE_VERBS:
                latin["praesens"] = row[1]
                latin["perfekt"] = row[2]
                latin["ppp"] = ""
            elif type == TYPE_ADJECTIVES:
                latin["nominativ_a"] = row[1]
                latin["nominativ_b"] = row[2]
            else:
                known_type = False

            # All known layouts keep the meanings in columns 3 to 5: the
            # first one is always taken, the other two only when filled in.
            # Slicing keeps rows without the optional columns from failing.
            bedeutungen = []
            if known_type:
                bedeutungen = [row[3]]
                bedeutungen.extend(m for m in row[4:6] if m != "")

            # TODO: Hints and mnemonics
            vocab.append({
                "id": next_id,
                "german": bedeutungen,
                "hint": "",
                "mnemonic": "",
                "type": type,
                "latin": latin
            })
            next_id += 1

    # Report the last ID that was handed out, not the next free one
    return vocab, next_id - 1
|
||
|
|
||
|
log("Lateinicus CSV to Vocabulary DB Model")

# Both the connection URI and the database name must be given
if len(sys.argv) < 3:
    log("Not enough arguments!", err=True)
    log("Usage: csv_vocab_to_mongo.py <URI> <Database>", err=True)
    sys.exit(1)

log("Generating vocabulary")
vocab = []
last_id = 0

# (log label, CSV file, word type) for every vocabulary file to import.
# Each call continues numbering from the ID returned by the previous one.
vocab_sources = (
    ("Nouns...", "Nomen.csv", TYPE_NOUNS),
    ("Verbs...", "Verben.csv", TYPE_VERBS),
    ("Adjectives...", "Adjektive.csv", TYPE_ADJECTIVES),
)
for label, csv_name, word_type in vocab_sources:
    log(label, tabs=1)
    entries, last_id = csv_to_vocab(csv_name, word_type, last_id)
    vocab += entries

# insert_many() rejects an empty document list, so stop with a clear message
if not vocab:
    log("No vocabulary was read, nothing to insert", err=True)
    sys.exit(1)

# Connect to the database
log("Inserting vocabulary into database")
log("Connecting...", tabs=1)
client = pymongo.MongoClient(sys.argv[1])
log("Getting DB...", tabs=1)
db = client[sys.argv[2]]
log("Inserting...", tabs=1)
res = db["vocabulary"].insert_many(vocab)

if len(res.inserted_ids) != len(vocab):
    log("Not enough elements were added to the database", err=True, tabs=1)
    # Signal the failure to the calling shell as well
    sys.exit(1)
else:
    log("Success", tabs=1)
|