-
Notifications
You must be signed in to change notification settings - Fork 2
/
NMvW_dump.py
93 lines (63 loc) · 2.77 KB
/
NMvW_dump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
## Server Manual
# - this server is a Microsoft Azure Data Factory
# - `ssh` is not supported; instead make SQL queries directly over TCP (the TDS protocol, port 1433)
# - use this command to manually connect to the DB (`tsql` uses the TDS protocol to talk between Linux and Microsoft SQL servers):
# `tsql -S azuredfserv.database.windows.net -U Demouser`
# - this [stackoverflow](https://stackoverflow.com/a/65126815) post solved getting pyodbc (Python tie-in of ODBC, an API for DBMSes) to work with the server
# ### Install
# - unixODBC (`pacman -Syu unixodbc`)
# - freeTDS (`pacman -Syu freetds`)
# - msodbcsql (from https://aur.archlinux.org/packages/msodbcsql/; install from arch AUR for install instruction)
# check `odbcinst -j` or `pyodbc.drivers()`
from time import time
from datetime import datetime
import glob
from tqdm import tqdm
import os
import pandas as pd
import numpy as np
import pyodbc
# Connection settings for the Azure SQL database.
# SECURITY: credentials were hard-coded in source. They can now be overridden
# via environment variables; the literals remain only as fallbacks so existing
# behavior is unchanged. TODO: rotate this password and drop the fallbacks.
server = os.environ.get("NMVW_DB_SERVER", 'tcp:azuredfserv.database.windows.net')
database = os.environ.get("NMVW_DB_NAME", 'Azuredf')
username = os.environ.get("NMVW_DB_USER", 'Demouser')
password = os.environ.get("NMVW_DB_PASSWORD", 'Knxdde#77')
driver = os.environ.get("NMVW_DB_DRIVER", '{ODBC Driver 17 for SQL Server}')
def table_to_DataFrame(connection, table_name, keys=None, until=None, random_n=None):
    """Read (part of) a SQL table into a pandas DataFrame.

    Parameters
    ----------
    connection : DBAPI connection passed straight to ``pd.read_sql``.
    table_name : str
        Name of the table to read.
    keys : list[str] | None
        Columns to select; ``None`` selects all columns (``*``).
    until : int | None
        If given, limit to the first ``until`` rows (``TOP n``, SQL Server syntax).
    random_n : int | None
        If given, sample roughly ``random_n`` random rows via ``TABLESAMPLE``
        (SQL Server syntax).

    Returns
    -------
    pandas.DataFrame with the query result.

    NOTE: table/column names are interpolated directly into the query string —
    only call this with trusted identifiers (identifiers cannot be
    parameterized through the DBAPI).
    """
    cols = "*" if not keys else ",".join(keys)
    # Fix: removed the redundant `if not until: until = ""` — the conditional
    # expression below already handles the falsy case.
    top = f"TOP {until}" if until else ""
    sample = f"TABLESAMPLE ({random_n} ROWS)" if random_n else ""
    query = f"SELECT {top} {cols} FROM {table_name} {sample};"
    print(query)  # log the query actually issued
    return pd.read_sql(query, connection)
def connect_to_DB():
    """Open a pyodbc connection to the configured Azure SQL database.

    Builds a standard ODBC connection string from the module-level
    settings (driver/server/database/username/password) on port 1433.
    """
    conn_str = (
        f"DRIVER={driver};SERVER={server};PORT=1433;"
        f"DATABASE={database};UID={username};PWD={password}"
    )
    return pyodbc.connect(conn_str)
# Enumerate the user tables on the server and decide which ones to download.
t0 = time()  # NOTE(review): t0 is never read afterwards — elapsed time is never reported
with connect_to_DB() as conn:
    # sys.tables lists every user table in the current database (SQL Server catalog view)
    q = "SELECT t.name, t.modify_date FROM sys.tables t"
    tables = pd.read_sql(q, conn)
    # "Person" is excluded from the listing — presumably because it contains
    # personal data; TODO confirm with the data owner.
    tables = tables[tables.name != "Person"]
    table_names = list(tables.name)
    print("found tables:" + "\n\t".join(table_names) + "\n\n")
    to_ignore = ["Objects"]
    print("ignoring tables:" + "\n\t".join(to_ignore) + "\n\n")
    to_download = [n for n in table_names if not n in to_ignore]
    # NOTE(review): the filtered list built above is immediately discarded by
    # this override — only the "Objects" table is actually downloaded, which
    # makes the two print statements above misleading.
    to_download = ["Objects"]
# Column selection per table, passed as `keys` to table_to_DataFrame.
# Only "Objects" has an entry; the list order determines both the column
# order in the generated SELECT and in the exported CSV.
keys_of = {
    "Objects": ['DepartmentID', 'ClassificationID',
                'ObjectName', 'Medium', 'Provenance', 'Exhibitions',
                'BeginISODate', 'EndISODate', 'Dated', 'Title',
                'Description', 'Notes', 'CuratorialRemarks', 'RelatedWorks',
                'HistAttributions'
                ]
}
# Dump each selected table into a freshly created, timestamped folder so that
# repeated runs never overwrite an earlier export.
timestamp = datetime.now().strftime("%d_%m_%Y-%H_%M_%S")
cur_folder = f"NMvW_tables_{timestamp}"
os.mkdir(cur_folder)

with connect_to_DB() as conn:
    for tbl_name in tqdm(to_download):
        # Fix: look up the column selection for *this* table instead of the
        # hard-coded keys_of["Objects"]. Falls back to all columns (None -> "*")
        # for tables without an entry; behavior is unchanged for the current
        # to_download == ["Objects"], but correct for any other table.
        cur_tbl = table_to_DataFrame(conn, tbl_name,
                                     keys=keys_of.get(tbl_name),
                                     until=None, random_n=5000)
        # pandas infers gzip compression from the .csv.gz suffix
        cur_tbl.to_csv(f"{cur_folder}/{tbl_name}.csv.gz", index=False)