# -*- coding: utf-8 -*-
# import cProfile
# import xml.etree.ElementTree as et
# import time
import requests
import shutil
from lxml import etree
import sqlite3
import gzip
import send2trash
import os
from datetime import datetime
# The function opens a file and parses the extracted data into the database
def parse_to_DB(file):
print("Processing ", str(file))
conn = sqlite3.connect('justice_v8.db')
c = conn.cursor()
for event, element in etree.iterparse(file, tag="Subjekt"):
# Bugfix for companies which have been deleted but appear in the list of existing companies
if ([element.find('vymazDatum')][0]) != None:
continue
else:
ICO = get_ICO(element)
# Vlozit prazdny radek s ICO
sql_id = insert_new_ICO(c, ICO, conn)
# Vlozit jednolive parametry
insert_prop(c, get_prop(element, "nazev"), conn, ICO, "nazev")
insert_prop(c, get_prop(element, "zapisDatum"), conn, ICO, "zapis")
insert_prop(c, get_prop(element, "vymazDatum"), conn, ICO, "vymaz")
insert_prop(c, get_prop(element, ".//udaje/Udaj/spisZn/oddil"), conn, ICO, "oddil")
insert_prop(c, get_prop(element, ".//udaje/Udaj/spisZn/vlozka"), conn, ICO, "vlozka")
insert_prop(c, get_prop(element, ".//udaje/Udaj/spisZn/soud/kod"), conn, ICO, "soud")
address_temp = str(adresa(get_SIDLO_v2(element)))
insert_prop(c, address_temp, conn, ICO, "sidlo")
insert_prop(c, get_prop(element, ".//udaje/Udaj/adresa/obec"), conn, ICO, "obec")
insert_prop(c, get_prop(element, ".//udaje/Udaj/adresa/ulice"), conn, ICO, "ulice")
insert_prop(c, get_prop(element, ".//udaje/Udaj/pravniForma/nazev"), conn, ICO, "pravni_forma")
# insert_prop_v2(c, find_business(element), conn, ICO, "predmet_podnikani", "predmet_podnikani", sql_id)
insert_obec(c, get_prop(element, ".//udaje/Udaj/adresa/obec"), conn, ICO, sql_id)
# insert_adresa(c, address_temp, conn, ICO, sql_id)
insert_ulice(c, get_prop(element, ".//udaje/Udaj/adresa/ulice"), conn, ICO, sql_id)
temp_adresy = zkusit_najit_vsechny_adresy(element)
for elem in temp_adresy:
insert_adresa(c, elem, conn, ICO, sql_id)
temp_osoby = zkusit_najit_vsechny_osoby(element)
for elem in temp_osoby:
insert_osoba(c, elem, conn, ICO, sql_id)
# for elem in temp_osoby:
# insert_osoba(c, elem, conn, ICO, sql_id)
# insert_prop(c, get_prop(element, ".//udaje/Udaj/adresa/obec"), conn, ICO, "sidlo")
# insert_prop(c, str(adresa(get_SIDLO(".//udaje/Udaj/adresa"))), conn, ICO, "sidlo")
# insert_prop(c, get_prop(element, ".//udaje/Udaj/adresa"), conn, ICO, "sidlo")
# Now, I need to go deeper into the file to extract data about the registered office
# subjekt_udaje = element.findall('.//Udaj')
# for udaj in subjekt_udaje:
# udaje_spolecnosti = udaj.findall(".//kod")
# if "SIDLO" in udaje_spolecnosti[0].text and sidlo_set == False:
# try:
# insert_prop(c, str(adresa(get_SIDLO(udaj))), conn, ICO, "sidlo")
# # print(sidlo)
# # spolecnosti2[ICO].set_SIDLO(get_SIDLO(udaj,udaje_spolecnosti))
# # c.execute("UPDATE spolecnosti SET sidlo = (?) WHERE ICO = (?)", (str(get_SIDLO(udaj,udaje_spolecnosti)), ICO,))
# # conn.commit()
# sidlo_set = True
# except:
# print("Zkusil jsem to a nevyslo to!")
# sidlo_set = False
element.clear()
# subjekt_udaje.clear()
# c.execute("DELETE FROM companies")
# c.execute("DELETE FROM obce")
# c.execute("DELETE FROM adresy")
# c.execute("DELETE FROM ulice")
# c.execute("DELETE FROM osoby")
conn.commit()
conn.close()
return 0
def zkusit_najit_vsechny_osoby(element):
stat_list = element.iter('osoba')
temp_osoby = []
for elem in stat_list:
try:
osoba_temp = ""
osoba_temp += get_prop(element, ".//jmeno") + " "
osoba_temp += get_prop(element, ".//prijmeni") + ", nar. "
osoba_temp += get_prop(element, ".//narozDatum")
temp_osoby.append(osoba_temp)
except:
pass
return temp_osoby
def zkusit_najit_vsechny_adresy(element):
stat_list = element.iter('adresa')
temp_adresy = []
for elem in stat_list:
temp_adresy.append(str(adresa(get_SIDLO_v3(elem))))
return temp_adresy
def find_business(element):
subjekt_udaje = element.findall('.//Udaj')
for udaj in subjekt_udaje:
udaje_spolecnosti = udaj.findall(".//kod")
if "PREDMET_PODNIKANI_SEKCE" in udaje_spolecnosti[0].text:
predmety2 = [elem.text.replace(u'\xa0', u' ') for elem in udaj.iterfind(".//hodnotaText")]
return predmety2
# TODO - Filter areas that are no longer relevant
def insert_obec(c, obec, conn, ICO, sql_id):
try:
c.execute("INSERT INTO obce (obec_jmeno) VALUES(?)", (obec,))
except:
pass
def insert_adresa(c, adresa, conn, ICO, sql_id):
try:
c.execute("INSERT INTO adresy (adresa_jmeno) VALUES(?)", (adresa,))
except:
pass
def insert_osoba(c, osoba, conn, ICO, sql_id):
try:
c.execute("INSERT INTO osoby (osoba_jmeno) VALUES(?)", (osoba,))
except:
pass
def insert_ulice(c, ulice, conn, ICO, sql_id):
try:
c.execute("INSERT INTO ulice (ulice_jmeno) VALUES(?)", (ulice,))
except:
pass
def insert_prop_v2(c, prop, conn, ICO, column, table, sql_id):
# print(column, prop, ICO)
# c.execute("UPDATE companies SET (" + column + ") = (?) WHERE ico = (?)", (prop, ICO,))
if prop != None:
for elem in prop:
# print(sql_id)
c.execute("INSERT INTO predmety_podnikani (company_id, predmet_podnikani) VALUES(?,?)", (sql_id, elem,))
# c.execute("UPDATE (%s) SET (%s, %s) = (?)" % (table, sql_id, elem), (prop, ICO,))
# Function to attempt to insert a placeholder for a new company based on ICO
def insert_new_ICO(c, ICO, conn):
try:
c.execute("INSERT INTO companies (ico) VALUES (?);", (ICO,))
return c.lastrowid
# c.execute("INSERT INTO companies VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (ICO, "", "", "", "", "", "", "", "", "", ""))
# # conn.commit()
except:
pass
def get_ICO(element):
try:
return element.find('ico').text
# return [element.find('ico')][0].text
except:
return "00000000"
def get_prop(element, prop):
try:
return element.find(prop).text
except:
return "0"
# return [element.find(prop)][0].text
def insert_prop(c, prop, conn, ICO, column):
# print(column, prop, ICO)
# c.execute("UPDATE companies SET (" + column + ") = (?) WHERE ico = (?)", (prop, ICO,))
try:
c.execute("UPDATE companies SET (%s) = (?) WHERE ico = (?)" % (column), (prop, ICO,))
except:
pass
# def insert_prop(c, prop, conn, ICO, column):
# # print(column, prop, ICO)
# # c.execute("UPDATE companies SET (" + column + ") = (?) WHERE ico = (?)", (prop, ICO,))
# try:
# c.execute("UPDATE companies SET (%s) = (?) WHERE ico = (?)" % (column), (prop, ICO,))
# except:
# pass
def get_SIDLO_v2(element):
address_field = []
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/statNazev"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/obec"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/ulice"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/castObce"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/cisloPo"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/cisloOr"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/psc"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/okres"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/adresaText"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/cisloEv"))
address_field.append(get_prop(element, ".//udaje/Udaj/adresa/cisloText"))
if address_field[0] == "Česká republika - neztotožněno":
address_field[0] = "Česká republika"
for i in range(len(address_field)):
if address_field[i] == "0":
address_field[i] = None
return address_field
def get_SIDLO_v3(element):
address_field = []
address_field.append(get_prop(element, ".//statNazev"))
address_field.append(get_prop(element, ".//obec"))
address_field.append(get_prop(element, ".//ulice"))
address_field.append(get_prop(element, ".//castObce"))
address_field.append(get_prop(element, ".//cisloPo"))
address_field.append(get_prop(element, ".//cisloOr"))
address_field.append(get_prop(element, ".//psc"))
address_field.append(get_prop(element, ".//okres"))
address_field.append(get_prop(element, ".//adresaText"))
address_field.append(get_prop(element, ".//cisloEv"))
address_field.append(get_prop(element, ".//cisloText"))
if address_field[0] == "Česká republika - neztotožněno":
address_field[0] = "Česká republika"
for i in range(len(address_field)):
if address_field[i] == "0":
address_field[i] = None
return address_field
# stat = get_prop(element, ".//udaje/Udaj/adresa/statNazev")
# obec = get_prop(element, ".//udaje/Udaj/adresa/obec")
# ulice = get_prop(element, ".//udaje/Udaj/adresa/ulice")
# castObce = get_prop(element, ".//udaje/Udaj/adresa/castObce")
# cisloPo = get_prop(element, ".//udaje/Udaj/adresa/cisloPo")
# cisloOr = get_prop(element, ".//udaje/Udaj/adresa/cisloOr")
# psc = get_prop(element, ".//udaje/Udaj/adresa/psc")
# okres = get_prop(element, ".//udaje/Udaj/adresa/okres")
# komplet_adresa = get_prop(element, ".//udaje/Udaj/adresa/adresaText")
# cisloEv = get_prop(element, ".//udaje/Udaj/adresa/cisloEv")
# cisloText = get_prop(element, ".//udaje/Udaj/adresa/cisloText")
# if address_field[0] != "Česká republika":
# print(address_field)
# print([stat, obec, ulice, castObce, cisloPo, cisloOr, psc, okres, komplet_adresa, cisloEv, cisloText])
# return [stat, obec, ulice, castObce, cisloPo, cisloOr, psc, okres, komplet_adresa, cisloEv, cisloText]
class adresa(object):
def __init__(self, adresa):
self.stat = adresa[0]
self.obec = adresa[1]
self.ulice = adresa[2]
self.castObce = adresa[3]
self.cisloPo = adresa[4]
self.cisloOr = adresa[5]
self.psc = adresa[6]
self.okres = adresa[7]
self.komplet_adresa = adresa[8]
self.cisloEv = adresa[9]
self.cisloText = adresa[10]
def __str__ (self):
try:
# if self.obec == "-":
# return("Neznama adresa")
if self.komplet_adresa != None:
if self.stat != None:
return str(self.komplet_adresa + " " + self.stat)
else:
return str(self.komplet_adresa)
# if self.obec == None:
# return("Neznama adresa")
if self.cisloText != None:
if self.ulice == None:
if self.psc != None:
return str(self.cisloText + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.psc + " " + self.obec + ", " + self.stat)
else:
return str(self.cisloText + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.obec + ", " + self.stat)
if self.okres == None and self.castObce != None:
if self.psc != None:
return str(self.obec + " - " + self.castObce + ", " + self.ulice + " " + self.cisloText + ", PSČ " + self.psc)
else:
return str(self.obec + " - " + self.castObce + ", " + self.ulice + " " + self.cisloText)
if self.okres == None and self.castObce == None and self.psc != None:
return str(self.obec + ", " + self.ulice + " " + self.cisloText + ", PSČ " + self.psc)
if self.castObce == None and self.psc == None:
return str(self.obec + ", " + self.ulice + " " + self.cisloText)
else:
if self.psc != None:
return str(self.obec + " " + self.cisloText + " " + "okres " + self.okres + ", PSČ " + self.psc)
else:
return str(self.obec + " " + self.cisloText + " " + "okres " + self.okres)
if self.ulice != None :
if self.cisloOr != None:
if self.cisloPo == None:
return str(self.ulice + " " + self.cisloOr + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.psc + " " + self.obec + ", " + self.stat)
elif self.psc != None:
return str(self.ulice + " " + self.cisloPo + "/" + self.cisloOr + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.psc + " " + self.obec + ", " + self.stat)
else:
return str(self.ulice + " " + self.cisloPo + "/" + self.cisloOr + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.obec + ", " + self.stat)
if self.cisloPo == None:
if self.cisloEv == None:
if self.psc != None:
return str(self.obec + ", " + self.ulice + "" + srovnat_obec_cast(self.obec, self.castObce) + ", PSČ" + self.psc + " " + self.stat)
else:
return str(self.obec + ", " + self.ulice + "" + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.stat)
else:
return str(self.ulice + " č.ev. " + self.cisloEv + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.psc + " " + self.obec + ", " + self.stat)
else:
if self.psc != None:
return str(self.ulice + " " + self.cisloPo + "" + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.psc + " " + self.obec + ", " + self.stat)
else:
return str(self.ulice + " " + self.cisloPo + "" + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.obec + ", " + self.stat)
if self.cisloPo == None and self.cisloEv != None:
return str(self.obec + " č.ev. " + self.cisloEv + ", " + self.psc + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.obec + ", " + self.stat)
if self.cisloPo != None:
return str("č.p. " + self.cisloPo + ", " + self.psc + srovnat_obec_cast(self.obec, self.castObce) + ", " + self.obec + ", " + self.stat)
if self.cisloPo == None and self.cisloEv == None and self.ulice == None:
return (self.obec + " " + self.stat)
except TypeError:
temp_adr = []
if self.ulice != None:
temp_adr.append(self.ulice)
if self.obec != None:
temp_adr.append(self.obec)
if self.castObce != None:
temp_adr.append(self.castObce)
if self.cisloPo != None:
temp_adr.append(self.cisloPo)
if self.cisloOr != None:
temp_adr.append(self.cisloOr)
if self.psc != None:
temp_adr.append(self.psc)
if self.okres != None:
temp_adr.append(self.okres)
if self.cisloEv != None:
temp_adr.append(self.cisloEv)
if self.cisloText != None:
temp_adr.append(self.cisloText)
if self.stat != None:
temp_adr.append(self.stat)
listToStr = ' '.join([str(elem) for elem in temp_adr])
return listToStr
def srovnat_obec_cast(obec, cast_obce):
if obec == cast_obce:
return str("")
elif cast_obce == None:
return str("")
else:
return str(", " + cast_obce)
def general_update(method):
typy_po = ["as", "sro", "vos", "ks", "dr", "zajzdrpo", "zahrfos", "ustav", "svj", "spolek", "prisp", "pobspolek",
"oszpo", "osznadf", "osznad", "orgzam", "odbororg", "nadf", "nad", "evrspol", "evrhzs", "evrdrspol"]
soudy = ["praha", "plzen", "brno", "ceske_budejovice", "hradec_kralove", "ostrava", "usti_nad_labem"]
# typy_po = ["as"]
# soudy = ["ostrava"]
rok = str(datetime.now().year)
for osoba in typy_po:
for soud in soudy:
if method == "down":
update_data(osoba + "-actual-" + soud + "-" + rok + ".xml.gz")
elif method == "db_update":
try:
parse_to_DB(os.path.join(str(os.getcwd()), "data", osoba) + "-actual-" + soud + "-" + rok + ".xml")
except:
pass
# print(osoba + "-actual-" + soud + "-" + rok + ".xml")
def update_data(filename):
source = "https://dataor.justice.cz/api/file/" + filename
# temp_file = "D:\\Programovani\\Moje vymysly\\Justice\\data\\temp-" + filename
temp_file = os.path.join(str(os.getcwd()), "data", "temp-" + filename)
# temp_file = str(os.getcwd()) + "\\data\\temp-" + filename
downloaded_OR = downloadOR(source)
if downloaded_OR != None:
save_temp_file(downloaded_OR, temp_file)
unzip_file(filename[:-3], temp_file)
delete_archive(temp_file)
parse_check = parseOR(temp_file[:-3])
if parse_check == True:
update_main_file(filename[:-3], temp_file[:-3])
# delete_archive(temp_file[:-3])
else:
delete_archive(temp_file[:-3])
def downloadOR(source):
download = requests.get(source, stream = True)
try:
print("Downloading file ", source)
download.raise_for_status()
except Exception as exc:
print("There was a problem: %s" % (exc))
return None
return download
def parseOR(download):
print("Parsing the file!")
try:
for event, element in etree.iterparse(download):
element.clear()
print("Parsing succsessful!")
except:
print("Parsing failed!")
return False
return True
def save_temp_file(download, temp_file):
temp_file = open(temp_file, "wb")
for chunk in download.iter_content(1000000):
temp_file.write(chunk)
temp_file.close()
def update_main_file(filename, temp_file):
shutil.move(temp_file, os.path.join(str(os.getcwd()), "data", filename))
def delete_temp_file(temp_file):
temp_file = open(temp_file, "w")
temp_file.write("0")
temp_file.close()
def unzip_file(filename, temp_file):
with gzip.open(temp_file, 'rb') as f_in:
with open(os.path.join(str(os.getcwd()), "data", "temp-" + filename), "wb") as f_out:
# with open(str(os.getcwd()) + "\\data\\temp-" + filename, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
def delete_archive(file):
send2trash.send2trash(file)
# parse_to_DB("ks-actual-ostrava-2021.xml")
# parse_to_DB("as-actual-praha-2020.xml")
# parse_to_DB("sro-actual-praha-2020.xml")
def do_both():
general_update("down")
general_update("db_update")
do_both()
# cProfile.run('do_both()')