# Lateinicus/scripts/csv_vocab_to_mongo.py

import csv
import sys
import os
import pymongo

# Definitions
# Word-type codes: passed as the `type` argument of csv_to_vocab, where they
# select the CSV column layout, and stored unchanged in the "type" field of
# every generated vocabulary entry.
TYPE_NOUNS = 0
TYPE_VERBS = 1
TYPE_ADJECTIVES = 2
TYPE_ADVERBS = 3

# Directory containing the vocabulary CSV files; csv_to_vocab joins it with
# the file name. The path is relative, so it is resolved against the current
# working directory of the process.
PATH_TO_VOCAB = "../data/vocab"

def preprocess_row(row):
    """Return *row* without its first element.

    Works on any sliceable sequence; an empty sequence yields an empty one.
    """
    without_first = slice(1, None)
    return row[without_first]

def genus_to_datatype(gen):
    """Translate a one-letter genus code into its German name.

    "m", "f" and "n" become "Maskulin", "Feminin" and "Neutrum"; any other
    value yields None.
    """
    known_genera = (
        ("m", "Maskulin"),
        ("f", "Feminin"),
        ("n", "Neutrum"),
    )
    for code, name in known_genera:
        if gen == code:
            return name
    return None

def log(msg, err=False, tabs=0):
    """Print *msg* to stdout behind a status marker.

    The marker is "[X] " when *err* is truthy and "[*] " otherwise; *tabs*
    tab characters are inserted between the marker and the message.
    """
    marker = "[X] " if err else "[*] "
    indent = "\t" * tabs
    print(marker + indent + msg)

def dbg(msg, tabs=0):
    """Print *msg* to stdout behind the "[D] " marker and *tabs* tab characters."""
    indent = "\t" * tabs
    line = "[D] " + indent + msg
    print(line)

def _optional_cell(row, index):
    """Return row[index], or "" when the row has no such column."""
    return row[index] if index < len(row) else ""


def csv_to_vocab(filename, type, from_id, num_lines_to_skip=3):
    """Parse one vocabulary CSV file into a list of vocabulary entries.

    The file is read from PATH_TO_VOCAB/filename. The first
    *num_lines_to_skip* lines are treated as headers and ignored, every cell
    is stripped of surrounding whitespace, and rows without any content
    (blank lines, or rows consisting only of empty cells) are skipped.

    Column 0 is always the "grundform". The remaining columns depend on
    *type*:

    * TYPE_NOUNS: 1 genitiv, 2 genus code (see genus_to_datatype),
      3 meaning, 4-6 further meanings, 7 mnemonic
    * TYPE_VERBS: 1 praesens, 2 meaning, 3-4 further meanings, 6 mnemonic
      (column 5 is not read)
    * TYPE_ADJECTIVES: 1 endung_f, 2 endung_n, 3 meaning,
      4-5 further meanings, 6 mnemonic
    * TYPE_ADVERBS: 1 meaning, 2-4 further meanings, 5 mnemonic

    Further meanings and the mnemonic are optional: an empty or missing cell
    is ignored. The grammatical columns and the first meaning are required;
    a row lacking them raises IndexError. For any other *type* only the
    grundform is read and the entry gets no meanings.

    Args:
        filename: Name of the CSV file inside PATH_TO_VOCAB.
        type: One of the TYPE_* codes; stored in each entry's "type" field.
            (The name shadows the builtin but is part of the signature.)
        from_id: The first entry receives the id ``from_id + 1``.
        num_lines_to_skip: Number of leading header lines to ignore.

    Returns:
        A ``(vocab, next_id)`` tuple: the list of entry dicts and the id
        following the last one assigned.

    NOTE(review): the second return value is the next unused id, not the last
    used one. A caller that passes it back in as *from_id* therefore leaves a
    gap of one id between consecutive files. Left unchanged because stored
    ids may already be referenced elsewhere - confirm before tightening.
    """
    # Per word type: (first meaning column, optional meaning columns,
    # mnemonic column).
    layouts = {
        TYPE_NOUNS: (3, (4, 5, 6), 7),
        TYPE_VERBS: (2, (3, 4), 6),
        TYPE_ADJECTIVES: (3, (4, 5), 6),
        TYPE_ADVERBS: (1, (2, 3, 4), 5),
    }

    next_id = from_id + 1
    vocab = []
    path = os.path.join(PATH_TO_VOCAB, filename)
    dbg("Reading from {0} ({1})".format(filename, path), tabs=2)
    # NOTE(review): no encoding is given, so the platform default applies -
    # confirm that it matches the encoding of the CSV exports.
    with open(path, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar="\"")
        for line_no, raw in enumerate(reader, start=1):
            # Skip the header lines
            if line_no <= num_lines_to_skip:
                continue

            # Remove any whitespaces in front of or after the string
            row = [col.strip() for col in raw]

            # csv.reader yields [] for blank lines; such rows (and rows of
            # only empty cells) carry no vocabulary and would otherwise
            # fail on row[0] or produce an empty entry.
            if not any(row):
                continue

            latin = {
                "grundform": row[0]
            }

            # Grammatical columns specific to the word type
            if type == TYPE_NOUNS:
                latin["genitiv"] = row[1]
                latin["genus"] = genus_to_datatype(row[2])
            elif type == TYPE_VERBS:
                latin["praesens"] = row[1]
            elif type == TYPE_ADJECTIVES:
                latin["endung_f"] = row[1]
                latin["endung_n"] = row[2]

            bedeutungen = []
            hint = ""
            mnemonic = ""
            if type in layouts:
                first_col, optional_cols, mnemonic_col = layouts[type]
                # The first meaning is mandatory and kept even when empty;
                # the others are only kept when they have content.
                bedeutungen = [row[first_col]]
                for col in optional_cols:
                    meaning = _optional_cell(row, col)
                    if meaning != "":
                        bedeutungen.append(meaning)
                mnemonic = _optional_cell(row, mnemonic_col)

            if mnemonic != "":
                print("Found mnemonic for:", latin["grundform"])

            vocab.append({
                "id": next_id,
                "german": bedeutungen,
                "hint": hint,
                "mnemonic": mnemonic,
                "type": type,
                "latin": latin
            })
            next_id += 1
    return vocab, next_id

# Script body: build the vocabulary from the four CSV files and, unless the
# DEBUG environment variable is set, insert it into the "vocabulary"
# collection of the database given on the command line.
log("Lateinicus CSV to Vocabulary DB Model")
# With DEBUG set the script only parses the CSV files and reports the number
# of entries, so the connection arguments are not required.
debug_mode = os.getenv("DEBUG") is not None
if len(sys.argv) < 3 and not debug_mode:
    log("Not enough arguments!", err=True)
    log("Usage: csv_vocab_to_mongo.py <URI> <Database>", err=True)
    sys.exit(1)

log("Generating vocabulary")
vocab = []

# (log label, CSV file, word type), processed in this order. The id value
# returned for one file is handed to the next one so ids keep increasing
# across files.
sources = (
    ("Nouns...", "Nomen.csv", TYPE_NOUNS),
    ("Verbs...", "Verben.csv", TYPE_VERBS),
    ("Adjectives...", "Adjektive.csv", TYPE_ADJECTIVES),
    ("Adverbs...", "Adverbien.csv", TYPE_ADVERBS),
)
last_id = 0
for label, filename, word_type in sources:
    log(label, tabs=1)
    words, last_id = csv_to_vocab(filename, word_type, last_id)
    vocab += words

if debug_mode:
    log("{} entries generated".format(len(vocab)))
    sys.exit()

# insert_many rejects an empty document list, so fail with a clear message
# instead of an exception from the driver.
if not vocab:
    log("No vocabulary entries generated, nothing to insert", err=True)
    sys.exit(1)

# Connect to the database
log("Inserting vocabulary into database")
log("Connecting...", tabs=1)
client = pymongo.MongoClient(sys.argv[1])
try:
    log("Getting DB...", tabs=1)
    db = client[sys.argv[2]]
    log("Inserting...", tabs=1)
    res = db["vocabulary"].insert_many(vocab)
finally:
    # Release the connection even when the insert raises.
    client.close()

if len(res.inserted_ids) != len(vocab):
    log("Not enough elements were added to the database", err=True, tabs=1)
    # Report the failure to the calling shell as well.
    sys.exit(1)
else:
    log("Success", tabs=1)