This repository has been archived on 2022-03-12. You can view files and clone it, but cannot push or open issues or pull requests.
Lateinicus/scripts/csv_vocab_to_mongo.py

178 lines
5.0 KiB
Python
Raw Normal View History

import csv
import sys
import os
import pymongo
# Definitions
# Word-class codes: passed to csv_to_vocab() as `type` to select the column
# layout that is parsed, and stored unchanged in each entry's "type" field.
TYPE_NOUNS = 0
TYPE_VERBS = 1
TYPE_ADJECTIVES = 2
TYPE_ADVERBS = 3
# Directory holding the vocabulary CSV files; csv_to_vocab() joins it with the
# file name. It is a relative path, so it is resolved against the current
# working directory of the process.
PATH_TO_VOCAB = "../data/vocab"
def preprocess_row(row):
    """Return *row* without its first element.

    Args:
        row: Any sliceable sequence (for example a list of CSV cells).

    Returns:
        A new sequence of the same kind holding everything after index 0;
        an empty input yields an empty result.
    """
    return row[1:]
def genus_to_datatype(gen):
    """Translate a one-letter genus code into its German display name.

    Args:
        gen: Genus abbreviation; "m", "f" and "n" are recognised.

    Returns:
        "Maskulin", "Feminin" or "Neutrum" for a recognised code, and None
        for anything else (including an empty string).
    """
    if gen == "m":
        return "Maskulin"
    if gen == "f":
        return "Feminin"
    if gen == "n":
        return "Neutrum"
    # Unrecognised codes are not an error here; the caller stores None.
    return None
def log(msg, err=False, tabs=0):
    """Print a status line to standard output.

    Args:
        msg: Text to print.
        err: When true the line is prefixed with "[X] ", otherwise "[*] ".
        tabs: Number of tab characters inserted between prefix and text.
    """
    prefix = "[X] " if err else "[*] "
    print(prefix + "\t" * tabs + msg)
def dbg(msg, tabs=0):
    """Print a debug line, prefixed with "[D] ", to standard output.

    Args:
        msg: Text to print.
        tabs: Number of tab characters inserted between prefix and text.
    """
    print("[D] " + "\t" * tabs + msg)
def _collect_meanings(row, first, last):
    """Return row[first] plus every non-empty cell of row[first + 1..last]."""
    meanings = [row[first]]
    meanings.extend(
        row[i] for i in range(first + 1, last + 1) if row[i] != ""
    )
    return meanings


def csv_to_vocab(filename, type, from_id, header_lines=3):
    """Parse one vocabulary CSV file into a list of entry dicts.

    Args:
        filename: Name of the CSV file inside PATH_TO_VOCAB.
        type: One of the TYPE_* constants; selects which columns are read.
            (The name shadows the builtin but is kept for existing callers.)
        from_id: The first entry receives the id ``from_id + 1``.
        header_lines: Number of leading CSV records to ignore.

    Returns:
        A tuple ``(vocab, next_id)``. ``vocab`` is the list of entries, each
        with the keys "id", "german", "hint", "mnemonic", "type" and "latin".
        ``next_id`` is one greater than the last id that was assigned.
        NOTE: feeding ``next_id`` back in as ``from_id`` therefore leaves one
        id unused between files; this is kept as-is so ids do not shift.

    Raises:
        IndexError: If a non-empty row has fewer columns than the layout
            for ``type`` requires.
        OSError: If the file cannot be opened.
    """
    next_id = from_id + 1
    vocab = []

    path = os.path.join(PATH_TO_VOCAB, filename)
    dbg("Reading from {0} ({1})".format(filename, path), tabs=2)

    # Explicit encoding so umlauts are read the same way on every platform.
    with open(path, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar="\"")
        for record_no, raw in enumerate(reader, start=1):
            # Skip the header lines
            if record_no <= header_lines:
                continue
            # A blank line is parsed as an empty record; nothing to import.
            if not raw:
                continue

            # Remove any whitespace in front of or after each cell
            row = [col.strip() for col in raw]

            hint = ""
            mnemonic = ""
            bedeutungen = []
            latin = {
                "grundform": row[0],
            }

            # The parsing depends on the type of word we're dealing with
            if type == TYPE_NOUNS:
                # Noun: genitive, genus, up to four meanings, mnemonic
                bedeutungen = _collect_meanings(row, 3, 6)
                mnemonic = row[7]
                latin["genitiv"] = row[1]
                latin["genus"] = genus_to_datatype(row[2])
            elif type == TYPE_VERBS:
                # Verb: present form, up to three meanings, mnemonic
                # NOTE(review): column 5 is never read; presumably it belongs
                # to the currently unused "perfekt" field. Confirm against
                # the CSV layout.
                bedeutungen = _collect_meanings(row, 2, 4)
                mnemonic = row[6]
                latin["praesens"] = row[1]
            elif type == TYPE_ADJECTIVES:
                # Adjective: f/n endings, up to three meanings, mnemonic
                bedeutungen = _collect_meanings(row, 3, 5)
                mnemonic = row[6]
                latin["endung_f"] = row[1]
                latin["endung_n"] = row[2]
            elif type == TYPE_ADVERBS:
                # Adverb: up to four meanings, mnemonic
                bedeutungen = _collect_meanings(row, 1, 4)
                mnemonic = row[5]

            if mnemonic != "":
                print("Found mnemonic for:", latin["grundform"])

            vocab.append({
                "id": next_id,
                "german": bedeutungen,
                "hint": hint,
                "mnemonic": mnemonic,
                "type": type,
                "latin": latin,
            })
            next_id += 1

    return vocab, next_id
log("Lateinicus CSV to Vocabulary DB Model")

# With DEBUG set, only the CSV files are parsed and counted, so the
# connection arguments are not required in that mode.
debug_run = os.getenv("DEBUG") is not None

if len(sys.argv) < 3 and not debug_run:
    log("Not enough arguments!", err=True)
    log("Usage: csv_vocab_to_mongo.py <URI> <Database>", err=True)
    sys.exit(1)

log("Generating vocabulary")
vocab = []
last_id = 0

# NOTE: csv_to_vocab() returns the id *after* the last one it assigned and
# starts numbering at from_id + 1, so one id stays unused between files.
for label, filename, word_type in (
    ("Nouns...", "Nomen.csv", TYPE_NOUNS),
    ("Verbs...", "Verben.csv", TYPE_VERBS),
    ("Adjectives...", "Adjektive.csv", TYPE_ADJECTIVES),
    ("Adverbs...", "Adverbien.csv", TYPE_ADVERBS),
):
    log(label, tabs=1)
    entries, last_id = csv_to_vocab(filename, word_type, last_id)
    vocab += entries

if debug_run:
    log("{} entries generated".format(len(vocab)))
    sys.exit()

# Connect to the database
log("Inserting vocabulary into database")
log("Connecting...", tabs=1)
client = pymongo.MongoClient(sys.argv[1])
try:
    log("Getting DB...", tabs=1)
    db = client[sys.argv[2]]

    log("Inserting...", tabs=1)
    res = db["vocabulary"].insert_many(vocab)

    if len(res.inserted_ids) != len(vocab):
        log("Not enough elements were added to the database",
            err=True, tabs=1)
        # Report the failure through the exit status as well.
        sys.exit(1)
    log("Success", tabs=1)
finally:
    # Release the connection even when the insert raises or we exit early.
    client.close()