"""Convert the Lateinicus vocabulary CSV files into MongoDB documents.

The script reads ``Nomen.csv``, ``Verben.csv``, ``Adjektive.csv`` and
``Adverbien.csv`` from ``PATH_TO_VOCAB``, turns every data row into a
vocabulary document and inserts all documents into the ``vocabulary``
collection of the database named on the command line.

Usage: csv_vocab_to_mongo.py <mongodb-uri> <database>

If the environment variable ``DEBUG`` is set, the CSV files are parsed, the
number of generated entries is printed and nothing is written to a database.
"""
import csv
import os
import sys

# Definitions
TYPE_NOUNS = 0
TYPE_VERBS = 1
TYPE_ADJECTIVES = 2
TYPE_ADVERBS = 3

PATH_TO_VOCAB = "../data/vocab"

# Number of leading rows of every CSV file that are skipped as header.
HEADER_ROWS = 3

# Gender abbreviation used in the CSV -> value stored in the document.
_GENUS_NAMES = {
    "m": "Maskulin",
    "f": "Feminin",
    "n": "Neutrum",
}

# (log label, file name, word type) in the order the files are processed.
_SOURCES = (
    ("Nouns...", "Nomen.csv", TYPE_NOUNS),
    ("Verbs...", "Verben.csv", TYPE_VERBS),
    ("Adjectives...", "Adjektive.csv", TYPE_ADJECTIVES),
    ("Adverbs...", "Adverbien.csv", TYPE_ADVERBS),
)


def preprocess_row(row):
    """Return *row* without its first element."""
    return row[1:]


def genus_to_datatype(gen):
    """Map a gender abbreviation ("m", "f", "n") to its stored name.

    Returns ``None`` for any other value.
    """
    return _GENUS_NAMES.get(gen)


def log(msg, err=False, tabs=0):
    """Print *msg* prefixed with ``[*]`` (or ``[X]`` if *err*) and *tabs* tabs."""
    prefix = "[X] " if err else "[*] "
    print(prefix + "\t" * tabs + msg)


def dbg(msg, tabs=0):
    """Print *msg* prefixed with ``[D]`` and *tabs* tabs."""
    print("[D] " + "\t" * tabs + msg)


def _cell(row, index):
    """Return ``row[index]``, or "" if the row has no such column."""
    return row[index] if index < len(row) else ""


def _meanings(row, first, last):
    """Collect the meanings stored in columns *first* .. *last* (inclusive).

    The column *first* is always taken, even when it is empty; the following
    columns are only taken when they are non-empty. Columns missing from a
    short row are treated as empty.
    """
    optional = [cell for cell in row[first + 1:last + 1] if cell != ""]
    return [row[first]] + optional


def csv_to_vocab(filename, type, from_id, header_rows=HEADER_ROWS):
    """Parse one vocabulary CSV file into a list of vocabulary documents.

    Args:
        filename: File name, joined onto ``PATH_TO_VOCAB``.
        type: One of the ``TYPE_*`` constants; selects the column layout.
            Any other value produces entries that only carry the base form.
        from_id: The first entry receives the id ``from_id + 1``.
        header_rows: Number of leading rows to skip (default ``HEADER_ROWS``).

    Returns:
        A tuple ``(vocab, next_id)`` where ``vocab`` is the list of documents
        and ``next_id`` is the first id that was *not* assigned.

    Raises:
        IndexError: If a data row lacks one of the columns that are read
            unconditionally (base form, grammar columns, first meaning).

    NOTE(review): because numbering starts at ``from_id + 1`` while the
    return value is already the next unused id, feeding the result into the
    next call leaves one id unused between two files. This is kept as is so
    that generated ids do not change.
    """
    next_id = from_id + 1
    vocab = []
    path = os.path.join(PATH_TO_VOCAB, filename)
    dbg("Reading from {0} ({1})".format(filename, path), tabs=2)

    with open(path, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar="\"")
        for line_no, raw in enumerate(reader, start=1):
            # Skip the header lines
            if line_no <= header_rows:
                continue

            # Remove any whitespace in front of or after each cell
            row = [col.strip() for col in raw]

            # csv.reader yields an empty list for a blank line; such rows
            # (and rows consisting only of empty cells) carry no entry.
            if not any(row):
                continue

            hint = ""
            mnemonic = ""
            bedeutungen = []
            latin = {"grundform": row[0]}

            # The parsing depends on the type of word we're dealing with
            if type == TYPE_NOUNS:
                latin["genitiv"] = row[1]
                latin["genus"] = genus_to_datatype(row[2])
                bedeutungen = _meanings(row, 3, 6)
                mnemonic = _cell(row, 7)
            elif type == TYPE_VERBS:
                latin["praesens"] = row[1]
                bedeutungen = _meanings(row, 2, 4)
                # Column 5 is not read. NOTE(review): presumably the perfect
                # form, which is not stored at the moment -- confirm.
                mnemonic = _cell(row, 6)
            elif type == TYPE_ADJECTIVES:
                latin["endung_f"] = row[1]
                latin["endung_n"] = row[2]
                bedeutungen = _meanings(row, 3, 5)
                mnemonic = _cell(row, 6)
            elif type == TYPE_ADVERBS:
                bedeutungen = _meanings(row, 1, 4)
                mnemonic = _cell(row, 5)

            if mnemonic != "":
                print("Found mnemonic for:", latin["grundform"])

            vocab.append({
                "id": next_id,
                "german": bedeutungen,
                "hint": hint,
                "mnemonic": mnemonic,
                "type": type,
                "latin": latin
            })
            next_id += 1

    return vocab, next_id


def main():
    """Generate the vocabulary from all CSV files and insert it into MongoDB.

    Expects the MongoDB connection target in ``sys.argv[1]`` and the database
    name in ``sys.argv[2]`` unless the environment variable ``DEBUG`` is set,
    in which case the script exits after reporting the number of entries.
    """
    log("Lateinicus CSV to Vocabulary DB Model")

    debug = os.getenv("DEBUG") is not None
    if len(sys.argv) < 3 and not debug:
        log("Not enough arguments!", err=True)
        log("Usage: csv_vocab_to_mongo.py <mongodb-uri> <database>", err=True)
        sys.exit(1)

    log("Generating vocabulary")
    vocab = []
    last_id = 0
    for label, filename, word_type in _SOURCES:
        log(label, tabs=1)
        entries, last_id = csv_to_vocab(filename, word_type, last_id)
        vocab += entries

    if debug:
        log("{} entries generated".format(len(vocab)))
        sys.exit()

    # Imported here because only the insert step needs the driver; the DEBUG
    # dry run above works without pymongo being installed.
    import pymongo

    # Connect to the database
    log("Inserting vocabulary into database")
    log("Connecting...", tabs=1)
    client = pymongo.MongoClient(sys.argv[1])
    log("Getting DB...", tabs=1)
    db = client[sys.argv[2]]

    log("Inserting...", tabs=1)
    res = db["vocabulary"].insert_many(vocab)
    if len(res.inserted_ids) != len(vocab):
        log("Not enough elements were added to the database", err=True, tabs=1)
    else:
        log("Success", tabs=1)


if __name__ == "__main__":
    main()