Lateinicus/scripts/csv_vocab_to_mongo.py

import csv
import sys
import os
import pymongo

# Definitions
# Word-type codes. csv_to_vocab() picks its column layout by comparing its
# `type` argument against these values and stores the code unchanged in the
# "type" field of every generated entry.
TYPE_NOUNS = 0
TYPE_VERBS = 1
TYPE_ADJECTIVES = 2
# NOTE: this name is singular, unlike the plural TYPE_* constants above.
TYPE_ADVERB = 3

# Directory that holds the vocabulary CSV files. The path is relative, so it
# is resolved against the current working directory when a file is opened.
PATH_TO_VOCAB = "../data/vocab"

def preprocess_row(row):
    """Return *row* without its first column.

    The result is a slice, so it has the same sequence type as *row* and is
    empty when *row* has fewer than two elements.
    """
    without_leading_column = row[1:]
    return without_leading_column

def genus_to_datatype(gen):
    """Translate a one-letter gender abbreviation into its full name.

    "m", "f" and "n" map to "Maskulin", "Feminin" and "Neutrum". Any other
    value (including upper-case letters and the empty string) yields None.
    """
    known_genera = (
        ("m", "Maskulin"),
        ("f", "Feminin"),
        ("n", "Neutrum"),
    )
    for abbreviation, full_name in known_genera:
        if gen == abbreviation:
            return full_name
    return None

def log(msg, err=False, tabs=0):
    """Print *msg* to stdout behind a status marker.

    The marker is "[X] " when *err* is truthy and "[*] " otherwise. *tabs*
    tab characters are inserted between the marker and the message.
    """
    marker = "[X] " if err else "[*] "
    indent = "\t" * tabs
    print(marker + indent + msg)

def dbg(msg, tabs=0):
    """Print *msg* to stdout behind the "[D] " debug marker.

    *tabs* tab characters are inserted between the marker and the message.
    """
    indent = "\t" * tabs
    print("[D] " + indent + msg)
   
def _collect_meanings(row, first):
    """Return the meanings stored in *row* starting at column *first*.

    The first meaning is mandatory. The two columns after it are optional:
    they are ignored when empty or when the row is too short to contain them.
    """
    optional = [meaning for meaning in row[first + 1:first + 3] if meaning != ""]
    return [row[first]] + optional


def csv_to_vocab(filename, type, from_id, header_lines=3):
    """Parse one vocabulary CSV file into a list of database documents.

    Parameters:
        filename: Name of the CSV file inside PATH_TO_VOCAB.
        type: One of the TYPE_* codes. It selects the column layout and is
            stored in the "type" field of every entry. For any other value
            the entries only carry the base form and an empty meaning list.
        from_id: The first entry receives the id ``from_id + 1``.
        header_lines: Number of leading CSV rows to skip (default 3).

    Column layout after the base form in column 0 (for nouns the first raw
    column is dropped beforehand):
        nouns:      genitive, gender abbreviation, up to three meanings
        verbs:      present, perfect, up to three meanings
        adjectives: feminine ending, neuter ending, up to three meanings
        adverbs:    up to three meanings

    Returns:
        A tuple ``(vocab, next_id)`` where ``vocab`` is the list of entries
        and ``next_id`` is one more than the last id that was assigned
        (``from_id + 1`` if the file produced no entries).

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a row lacks a mandatory column.

    NOTE(review): numbering starts at ``from_id + 1`` while the returned value
    is already the next free id, so feeding the result back in as ``from_id``
    leaves one id unused. This is kept as-is so generated ids do not change.
    """
    next_id = from_id + 1
    vocab = []
    path = os.path.join(PATH_TO_VOCAB, filename)
    dbg("Reading from {0} ({1})".format(filename, path), tabs=2)
    with open(path, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar="\"")
        for line_no, raw in enumerate(reader, start=1):
            # Skip the header lines
            if line_no <= header_lines:
                continue
            # csv.reader yields an empty list for a blank line; there is
            # nothing to parse in it.
            if not raw:
                continue

            # The nouns are special: their first column is not part of the data
            row = preprocess_row(raw) if type == TYPE_NOUNS else raw

            latin = {
                "grundform": row[0]
            }

            # The parsing depends on the type of word we're dealing with
            bedeutungen = []
            if type == TYPE_NOUNS:
                latin["genitiv"] = row[1]
                latin["genus"] = genus_to_datatype(row[2])
                bedeutungen = _collect_meanings(row, 3)
            elif type == TYPE_VERBS:
                latin["praesens"] = row[1]
                latin["perfekt"] = row[2]
                # The CSV carries no PPP column; the field is left empty
                latin["ppp"] = ""
                bedeutungen = _collect_meanings(row, 3)
            elif type == TYPE_ADJECTIVES:
                latin["endung_f"] = row[1]
                latin["endung_n"] = row[2]
                bedeutungen = _collect_meanings(row, 3)
            elif type == TYPE_ADVERB:
                bedeutungen = _collect_meanings(row, 1)

            # TODO: Hints and mnemonics
            vocab.append({
                "id": next_id,
                "german": bedeutungen,
                "hint": "",
                "mnemonic": "",
                "type": type,
                "latin": latin
            })
            next_id += 1
    return vocab, next_id

# Script body: build the complete vocabulary from the CSV files and insert it
# into the "vocabulary" collection of the database named on the command line.
log("Lateinicus CSV to Vocabulary DB Model")
if len(sys.argv) < 3:
    log("Not enough arguments!", err=True)
    log("Usage: csv_vocab_to_mongo.py <URI> <Database>", err=True)
    sys.exit(1)

log("Generating vocabulary")
# (log label, CSV file, word-type code) for every file to import, in order.
# The adverb constant is TYPE_ADVERB (singular); no TYPE_ADVERBS exists.
sources = (
    ("Nouns...", "Nomen.csv", TYPE_NOUNS),
    ("Verbs...", "Verben.csv", TYPE_VERBS),
    ("Adjectives...", "Adjektive.csv", TYPE_ADJECTIVES),
    ("Adverbs...", "Adverbien.csv", TYPE_ADVERB),
)
vocab = []
last_id = 0
for label, filename, word_type in sources:
    log(label, tabs=1)
    # Continue numbering from the value returned for the previous file so
    # that ids stay unique across all word types.
    entries, last_id = csv_to_vocab(filename, word_type, last_id)
    vocab += entries

# insert_many() rejects an empty document list, so stop with a clear message
# instead of an exception from the driver.
if not vocab:
    log("No vocabulary entries were generated, nothing to insert", err=True)
    sys.exit(1)

# Connect to the database
log("Inserting vocabulary into database")
log("Connecting...", tabs=1)
client = pymongo.MongoClient(sys.argv[1])
log("Getting DB...", tabs=1)
db = client[sys.argv[2]]
log("Inserting...", tabs=1)
res = db["vocabulary"].insert_many(vocab)

if len(res.inserted_ids) != len(vocab):
    log("Not enough elements were added to the database", err=True, tabs=1)
else:
    log("Success", tabs=1)
scripts: Add scripts to help testing and administration 2018-10-01 15:08:37 +00:00			`import csv`
			`import sys`
			`import os`
			`import pymongo`

			`# Definitions`
			`TYPE_NOUNS = 0`
			`TYPE_VERBS = 1`
			`TYPE_ADJECTIVES = 2`
feat: Don't forget about the adverbs 2018-10-02 13:49:53 +00:00			`TYPE_ADVERB = 3`
scripts: Add scripts to help testing and administration 2018-10-01 15:08:37 +00:00
			`PATH_TO_VOCAB = "../data/vocab"`

			`def preprocess_row(row):`
			`return row[1:]`

			`def genus_to_datatype(gen):`
			`if (gen == "m"): return "Maskulin"`
fix: Wrong mapping for 'feminine Nomen' 2018-10-02 13:29:03 +00:00			`if (gen == "f"): return "Feminin"`
scripts: Add scripts to help testing and administration 2018-10-01 15:08:37 +00:00			`if (gen == "n"): return "Neutrum"`

			`def log(msg, err=False, tabs=0):`
			`if (not err):`
			`print("[*] " + "\t" * tabs + msg)`
			`else:`
			`print("[X] " + "\t" * tabs + msg)`

			`def dbg(msg, tabs=0):`
			`print("[D] " + "\t" * tabs + msg)`

			`def csv_to_vocab(filename, type, from_id):`
			`id = from_id + 1`
			`vocab = []`
			`skip = 0`
			`path = os.path.join(PATH_TO_VOCAB, filename)`
			`dbg("Reading from {0} ({1})".format(filename, path), tabs=2)`
			`with open(path, newline="") as csvfile:`
			`reader = csv.reader(csvfile, delimiter=",", quotechar="\"")`
			`for raw in reader:`
			`skip += 1`
			`# Skip the header lines`
			`# if (skip < num_lines_to_skip + 1):`
			`if (skip < 4):`
			`continue`

			`# The nouns are special`
			`row = preprocess_row(raw) if type == TYPE_NOUNS else raw`

			`grundform = row[0]`
			`hint = ""`
			`mnemonic = ""`

			`latin = {`
			`"grundform": grundform`
			`}`

			`# The parsing depends on the type of word we're dealing with`
			`bedeutungen = []`
			`if (type == TYPE_NOUNS):`
			`# Nomen`
			`genitiv = row[1]`
			`genus = genus_to_datatype(row[2])`
			`bedeutungen = [row[3]]`
			`if (row[4] != ""):`
			`bedeutungen.append(row[4])`
			`if (row[5] != ""):`
			`bedeutungen.append(row[5])`

			`latin["genitiv"] = genitiv`
			`latin["genus"] = genus`
			`elif (type == TYPE_VERBS):`
			`# Verb`
			`praesens = row[1]`
			`perfekt = row[2]`
			`bedeutungen = [row[3]]`
			`if (row[4] != ""):`
			`bedeutungen.append(row[4])`
			`if (row[5] != ""):`
			`bedeutungen.append(row[5])`

			`latin["praesens"] = praesens`
			`latin["perfekt"] = perfekt`
			`latin["ppp"] = ""`
			`elif (type == TYPE_ADJECTIVES):`
			`# Adjektiv`
			`endung_f = row[1]`
			`endung_n = row[2]`
			`bedeutungen = [row[3]]`
			`if (row[4] != ""):`
			`bedeutungen.append(row[4])`
			`if (row[5] != ""):`
			`bedeutungen.append(row[5])`

fix: Remove the sys.exit() 2018-10-02 11:10:10 +00:00			`latin["endung_f"] = endung_f`
			`latin["endung_n"] = endung_n`
feat: Don't forget about the adverbs 2018-10-02 13:49:53 +00:00			`elif (type == TYPE_ADVERB):`
			`# Adverb`
			`bedeutungen = [row[1]]`
			`if (row[2] != ""):`
			`bedeutungen.append(row[2])`
			`if (row[3] != ""):`
			`bedeutungen.append(row[3])`

scripts: Add scripts to help testing and administration 2018-10-01 15:08:37 +00:00			`# TODO: Hints and mnemonics`
			`vocab.append({`
			`"id": id,`
			`"german": bedeutungen,`
			`"hint": hint,`
			`"mnemonic": mnemonic,`
			`"type": type,`
			`"latin": latin`
			`})`
			`id += 1`
			`return vocab, id`

			`log("Lateinicus CSV to Vocabulary DB Model")`
			`if (len(sys.argv) < 3):`
			`log("Not enough arguments!", err=True)`
			`log("Usage: csv_vocab_to_mongo.py <URI> <Database>", err=True)`
			`sys.exit(1)`

			`log("Generating vocabulary")`
			`id = 0`
			`vocab = []`
			`# Nouns`
			`log("Nouns...", tabs=1)`
			`nouns, last_id = csv_to_vocab("Nomen.csv", TYPE_NOUNS, 0)`
			`vocab += nouns`
			`id = last_id`

			`log("Verbs...", tabs=1)`
			`verbs, last_id = csv_to_vocab("Verben.csv", TYPE_VERBS, last_id)`
			`vocab += verbs`
			`id = last_id`

			`log("Adjectives...", tabs=1)`
			`adj, last_id = csv_to_vocab("Adjektive.csv", TYPE_ADJECTIVES, last_id)`
			`vocab += adj`
			`id = last_id`

feat: Don't forget about the adverbs 2018-10-02 13:49:53 +00:00			`log("Adverbs...", tabs=1)`
			`adj, last_id = csv_to_vocab("Adverbien.csv", TYPE_ADVERBS, last_id)`
			`vocab += adj`


scripts: Add scripts to help testing and administration 2018-10-01 15:08:37 +00:00			`# Connect to the database`
			`log("Inserting vocabulary into database")`
			`log("Connecting...", tabs=1)`
			`client = pymongo.MongoClient(sys.argv[1])`
			`log("Getting DB...", tabs=1)`
			`db = client[sys.argv[2]]`
			`log("Inserting...", tabs=1)`
			`res = db["vocabulary"].insert_many(vocab)`

			`if (len(res.inserted_ids) != len(vocab)):`
			`log("Not enough elements were added to the database", err=True, tabs=1)`
			`else:`
			`log("Success", tabs=1)`