This repository has been archived on 2022-03-12. You can view files and clone it, but cannot push or open issues or pull requests.
Lateinicus/scripts/csv_vocab_to_mongo.py

159 lines
4.4 KiB
Python
Raw Normal View History

import csv
import sys
import os
import pymongo
# Definitions
# Word-type tags; csv_to_vocab uses them to pick the column layout of a CSV
# file and stores them in each entry's "type" field.
TYPE_NOUNS = 0
TYPE_VERBS = 1
TYPE_ADJECTIVES = 2
TYPE_ADVERB = 3
# Directory holding the vocabulary CSV files. The path is relative, so it is
# resolved against the current working directory when a file is opened.
PATH_TO_VOCAB = "../data/vocab"
def preprocess_row(row):
    """Return a copy of *row* without its first column."""
    remaining_columns = row[1:]
    return remaining_columns
def genus_to_datatype(gen):
    """Translate a one-letter gender abbreviation into its full name.

    Returns "Maskulin", "Feminin" or "Neutrum" for "m", "f" or "n"
    respectively; any other value yields None.
    """
    for abbreviation, full_name in (
        ("m", "Maskulin"),
        ("f", "Feminin"),
        ("n", "Neutrum"),
    ):
        if gen == abbreviation:
            return full_name
    return None
def log(msg, err=False, tabs=0):
    """Print *msg* to stdout behind a status marker.

    The marker is "[X] " when *err* is truthy and "[*] " otherwise; *tabs*
    tab characters are inserted between the marker and the message.
    """
    marker = "[X] " if err else "[*] "
    indent = "\t" * tabs
    print(marker + indent + msg)
def dbg(msg, tabs=0):
    """Print *msg* to stdout behind a "[D] " marker and *tabs* tab characters."""
    indent = "\t" * tabs
    line = "[D] " + indent + msg
    print(line)
def _collect_meanings(row, first):
    """Return the meaning at row[first] plus up to two non-empty optional ones.

    The optional meanings are read with a slice, so a row that simply lacks
    the trailing columns is accepted instead of raising IndexError.
    """
    optional = [meaning for meaning in row[first + 1:first + 3] if meaning != ""]
    return [row[first]] + optional


def csv_to_vocab(filename, type, from_id, num_lines_to_skip=3):
    """Parse one vocabulary CSV file into a list of vocabulary entries.

    The file is read from PATH_TO_VOCAB. Its first ``num_lines_to_skip`` rows
    are treated as header lines and ignored, as are completely blank lines.

    Args:
        filename: Name of the CSV file inside PATH_TO_VOCAB.
        type: One of the TYPE_* constants; selects the column layout. The
            name shadows the builtin but is kept so existing callers work.
        from_id: The first parsed entry receives the id ``from_id + 1``.
        num_lines_to_skip: Number of leading header rows to ignore.

    Returns:
        A tuple ``(vocab, next_id)`` where ``vocab`` is the list of entry
        dicts and ``next_id`` is the id after the last one assigned, i.e.
        ``from_id + 1 + len(vocab)``.

        NOTE: feeding ``next_id`` back in as ``from_id`` leaves one id unused
        between two files. This is kept as is so that ids generated by
        earlier runs do not shift.

    Raises:
        IndexError: If a data row lacks one of the mandatory columns.
    """
    next_id = from_id + 1
    vocab = []

    path = os.path.join(PATH_TO_VOCAB, filename)
    dbg("Reading from {0} ({1})".format(filename, path), tabs=2)
    with open(path, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar="\"")
        for line_no, raw in enumerate(reader, start=1):
            # Skip the header lines
            if line_no <= num_lines_to_skip:
                continue
            # csv.reader yields an empty list for a blank line
            if not raw:
                continue

            # The nouns are special: their first column is dropped
            row = preprocess_row(raw) if type == TYPE_NOUNS else raw

            latin = {
                "grundform": row[0]
            }

            # The parsing depends on the type of word we're dealing with.
            # An unrecognised type yields an entry without meanings.
            bedeutungen = []
            if type == TYPE_NOUNS:
                # Noun: genitive, gender, then the meanings
                latin["genitiv"] = row[1]
                latin["genus"] = genus_to_datatype(row[2])
                bedeutungen = _collect_meanings(row, 3)
            elif type == TYPE_VERBS:
                # Verb: present, perfect, then the meanings
                latin["praesens"] = row[1]
                latin["perfekt"] = row[2]
                latin["ppp"] = ""
                bedeutungen = _collect_meanings(row, 3)
            elif type == TYPE_ADJECTIVES:
                # Adjective: feminine and neuter endings, then the meanings
                latin["endung_f"] = row[1]
                latin["endung_n"] = row[2]
                bedeutungen = _collect_meanings(row, 3)
            elif type == TYPE_ADVERB:
                # Adverb: the meanings follow the base form directly
                bedeutungen = _collect_meanings(row, 1)

            # TODO: Hints and mnemonics
            vocab.append({
                "id": next_id,
                "german": bedeutungen,
                "hint": "",
                "mnemonic": "",
                "type": type,
                "latin": latin
            })
            next_id += 1

    return vocab, next_id
# Script body: parse every vocabulary CSV file and insert the combined
# entries into the "vocabulary" collection of the given MongoDB database.
log("Lateinicus CSV to Vocabulary DB Model")

# Expects the MongoDB URI and the database name as command-line arguments
if len(sys.argv) < 3:
    log("Not enough arguments!", err=True)
    log("Usage: csv_vocab_to_mongo.py <URI> <Database>", err=True)
    sys.exit(1)

log("Generating vocabulary")
vocab = []

# Each call continues the id sequence from the value the previous call returned
log("Nouns...", tabs=1)
nouns, last_id = csv_to_vocab("Nomen.csv", TYPE_NOUNS, 0)
vocab += nouns

log("Verbs...", tabs=1)
verbs, last_id = csv_to_vocab("Verben.csv", TYPE_VERBS, last_id)
vocab += verbs

log("Adjectives...", tabs=1)
adj, last_id = csv_to_vocab("Adjektive.csv", TYPE_ADJECTIVES, last_id)
vocab += adj

log("Adverbs...", tabs=1)
# The constant is named TYPE_ADVERB (singular); TYPE_ADVERBS does not exist
adverbs, last_id = csv_to_vocab("Adverbien.csv", TYPE_ADVERB, last_id)
vocab += adverbs

# Connect to the database
log("Inserting vocabulary into database")
log("Connecting...", tabs=1)
client = pymongo.MongoClient(sys.argv[1])
try:
    log("Getting DB...", tabs=1)
    db = client[sys.argv[2]]

    log("Inserting...", tabs=1)
    res = db["vocabulary"].insert_many(vocab)
    if len(res.inserted_ids) != len(vocab):
        log("Not enough elements were added to the database", err=True, tabs=1)
    else:
        log("Success", tabs=1)
finally:
    # Release the connection even if the insert raised
    client.close()