# Lateinicus/scripts/csv_vocab_to_mongo.py

import csv
import sys
import os
import pymongo

# Definitions
# Word-type codes. csv_to_vocab() uses them to decide how the columns of a
# CSV row are interpreted and stores the code under the "type" key of every
# generated vocabulary entry.
TYPE_NOUNS = 0
TYPE_VERBS = 1
TYPE_ADJECTIVES = 2

# Directory containing the vocabulary CSV files. This is a relative path, so
# it is resolved against the current working directory of the process.
PATH_TO_VOCAB = "../data/vocab"

def preprocess_row(row):
    """Return *row* without its first element.

    csv_to_vocab() applies this to noun rows only, so that the remaining
    columns line up with the indices used for the other word types.
    """
    everything_after_first = slice(1, None)
    return row[everything_after_first]

def genus_to_datatype(gen):
    """Expand a one-letter genus abbreviation into its full name.

    "m" maps to "Maskulin", "w" to "Feminin" and "n" to "Neutrum". Any other
    value yields None.
    """
    known_genera = (
        ("m", "Maskulin"),
        ("w", "Feminin"),
        ("n", "Neutrum"),
    )
    for abbreviation, full_name in known_genera:
        if gen == abbreviation:
            return full_name
    return None

def log(msg, err=False, tabs=0):
    """Print *msg* to stdout behind a status marker.

    The marker is "[X] " when *err* is truthy and "[*] " otherwise; *tabs*
    tab characters are inserted between the marker and the message.
    """
    marker = "[X] " if err else "[*] "
    indent = "\t" * tabs
    print(marker + indent + msg)

def dbg(msg, tabs=0):
    """Print *msg* to stdout behind a "[D] " marker and *tabs* tab characters."""
    pieces = ["[D] ", "\t" * tabs, msg]
    print("".join(pieces))

def csv_to_vocab(filename, type, from_id, num_lines_to_skip=3):
    """Parse one vocabulary CSV file into a list of vocabulary entries.

    Args:
        filename: Name of the CSV file inside PATH_TO_VOCAB.
        type: TYPE_NOUNS, TYPE_VERBS or TYPE_ADJECTIVES; selects how the
            columns of each row are interpreted. The name shadows the
            builtin but is kept because it is part of the signature.
        from_id: The last id that is already taken. The first entry parsed
            from this file receives ``from_id + 1``.
        num_lines_to_skip: Number of leading header rows to ignore.

    Returns:
        A ``(vocab, last_id)`` tuple. ``vocab`` is the list of entry dicts
        and ``last_id`` is the id of the last entry created (``from_id`` if
        the file produced none), so it can be passed as ``from_id`` for the
        next file without leaving a gap in the id sequence.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a data row lacks the columns required for ``type``.
    """
    next_id = from_id + 1
    vocab = []
    path = os.path.join(PATH_TO_VOCAB, filename)
    dbg("Reading from {0} ({1})".format(filename, path), tabs=2)
    with open(path, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar="\"")
        for row_number, raw in enumerate(reader, start=1):
            # Skip the header lines
            if row_number <= num_lines_to_skip:
                continue

            # csv.reader yields an empty list for a blank line; there is
            # nothing to parse, and indexing it would raise IndexError.
            if not raw:
                continue

            # The nouns are special: their first column is dropped so the
            # remaining columns share the indices used by the other types.
            row = preprocess_row(raw) if type == TYPE_NOUNS else raw

            latin = {
                "grundform": row[0]
            }

            # For every known type, column 3 holds the mandatory meaning and
            # columns 4-5 hold optional ones; empty cells are ignored. The
            # slice tolerates rows that omit the optional columns entirely.
            bedeutungen = []
            if type in (TYPE_NOUNS, TYPE_VERBS, TYPE_ADJECTIVES):
                bedeutungen = [row[3]]
                bedeutungen.extend(extra for extra in row[4:6] if extra != "")

            # Columns 1 and 2 depend on the type of word we're dealing with
            if type == TYPE_NOUNS:
                latin["genitiv"] = row[1]
                latin["genus"] = genus_to_datatype(row[2])
            elif type == TYPE_VERBS:
                latin["praesens"] = row[1]
                latin["perfekt"] = row[2]
                # No column is read for the PPP; it is stored empty.
                latin["ppp"] = ""
            elif type == TYPE_ADJECTIVES:
                latin["endung_f"] = row[1]
                latin["endung_n"] = row[2]

            # TODO: Hints and mnemonics
            vocab.append({
                "id": next_id,
                "german": bedeutungen,
                "hint": "",
                "mnemonic": "",
                "type": type,
                "latin": latin
            })
            next_id += 1

    # Report the last id actually handed out. Returning next_id instead
    # would make a caller that feeds the value back in as from_id skip one
    # id between files.
    return vocab, next_id - 1

# Script entry: build the vocabulary from the CSV files and insert it into
# the MongoDB database given on the command line.
log("Lateinicus CSV to Vocabulary DB Model")
if len(sys.argv) < 3:
    log("Not enough arguments!", err=True)
    log("Usage: csv_vocab_to_mongo.py <URI> <Database>", err=True)
    sys.exit(1)

log("Generating vocabulary")
vocab = []
# The id returned for one file is passed on as the starting point of the
# next, so ids never collide across the three files.
last_id = 0
for label, filename, word_type in (
    ("Nouns...", "Nomen.csv", TYPE_NOUNS),
    ("Verbs...", "Verben.csv", TYPE_VERBS),
    ("Adjectives...", "Adjektive.csv", TYPE_ADJECTIVES),
):
    log(label, tabs=1)
    entries, last_id = csv_to_vocab(filename, word_type, last_id)
    vocab += entries

# insert_many() rejects an empty document list, so report the problem
# clearly instead of failing inside the driver.
if not vocab:
    log("No vocabulary entries found, nothing to insert", err=True)
    sys.exit(1)

# Connect to the database
log("Inserting vocabulary into database")
log("Connecting...", tabs=1)
client = pymongo.MongoClient(sys.argv[1])
log("Getting DB...", tabs=1)
db = client[sys.argv[2]]
log("Inserting...", tabs=1)
res = db["vocabulary"].insert_many(vocab)

if len(res.inserted_ids) != len(vocab):
    log("Not enough elements were added to the database", err=True, tabs=1)
    # Signal the failure to the calling shell, like the usage error above.
    sys.exit(1)

log("Success", tabs=1)