
Commit

lineas agregadas

Aggregated-lines functionality for destination validation.

sanapolsky committed Aug 17, 2023
1 parent e3826db commit d27bccb
Showing 9 changed files with 164 additions and 76 deletions.
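The commit introduces an "aggregated line" identifier (id_linea_agg): several id_linea values that operate as one service can share a single id_linea_agg, and destination validation is carried out at that aggregated level. A minimal sketch of the idea with hypothetical data (the real mapping comes from the lines CSV configured under nombre_archivo_informacion_lineas):

import pandas as pd

# Per-line metadata: lines 10 and 11 validate as one aggregated group.
metadata_lineas = pd.DataFrame({
    "id_linea": [10, 11, 12],
    "id_linea_agg": [10, 10, 12],
})

# Legs (etapas) recorded per physical line.
etapas = pd.DataFrame({"id_linea": [10, 11, 12], "h3_o": ["a", "a", "b"]})

# Joining the mapping lets everything downstream group by id_linea_agg,
# so a destination near any stop of the group validates for all its lines.
etapas = etapas.merge(metadata_lineas, on="id_linea", how="left")
print(etapas.groupby(["id_linea_agg", "h3_o"]).size())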
Binary file modified docs/configuraciones.xlsx
25 changes: 20 additions & 5 deletions urbantrips/carto/carto.py
@@ -39,24 +39,39 @@ def update_stations_catchment_area(ring_size):
# Read the stops based on the legs (etapas)
q = """
select id_linea,h3_o as parada from etapas
group by id_linea,h3_o having count(*) >1 and parada <> 0
"""
paradas_etapas = pd.read_sql(q, conn_data)

metadata_lineas = pd.read_sql_query(
"""
SELECT *
FROM metadata_lineas
""",
conn_insumos,
)

paradas_etapas = paradas_etapas.merge(metadata_lineas[['id_linea',
'id_linea_agg']],
how='left',
on='id_linea').drop(['id_linea'], axis=1)

paradas_etapas = paradas_etapas.groupby(
    ['id_linea_agg', 'parada'], as_index=False).size()
paradas_etapas = paradas_etapas[paradas_etapas['size'] > 1].drop(
    ['size'], axis=1)

# Read the stops already present in the validation matrix
q = """
select distinct id_linea, parada, 1 as m from matriz_validacion
select distinct id_linea_agg, parada, 1 as m from matriz_validacion
"""
paradas_en_matriz = pd.read_sql(q, conn_insumos)

# Detect which stops are new for each line
paradas_nuevas = paradas_etapas\
.merge(paradas_en_matriz,
on=['id_linea', 'parada'],
on=['id_linea_agg', 'parada'],
how='left')

paradas_nuevas = paradas_nuevas.loc[paradas_nuevas.m.isna(), [
'id_linea', 'parada']]
'id_linea_agg', 'parada']]

if len(paradas_nuevas) > 0:
areas_influencia_nuevas = pd.concat(
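The stop-count filter moves out of the SQL (having count(*) > 1) and into pandas so it can run at the id_linea_agg level. A sketch of why that matters, assuming toy frames with the diff's column names:

import pandas as pd

paradas_etapas = pd.DataFrame({
    "id_linea": [1, 2, 3],
    "parada": ["h3_x", "h3_x", "h3_y"],
})
metadata_lineas = pd.DataFrame({
    "id_linea": [1, 2, 3],
    "id_linea_agg": [1, 1, 3],
})

# Aggregate stop usage at the id_linea_agg level ...
paradas = paradas_etapas.merge(
    metadata_lineas[["id_linea", "id_linea_agg"]], on="id_linea", how="left"
).drop("id_linea", axis=1)

# ... so "h3_x", used once by line 1 and once by line 2, now passes the
# count > 1 filter because both lines belong to aggregated line 1.
paradas = paradas.groupby(["id_linea_agg", "parada"], as_index=False).size()
print(paradas[paradas["size"] > 1].drop("size", axis=1))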
128 changes: 74 additions & 54 deletions urbantrips/carto/routes.py
@@ -276,83 +276,103 @@ def process_routes_metadata():
and uploads metadata to the db
"""


conn_insumos = iniciar_conexion_db(tipo='insumos')
conn_data = iniciar_conexion_db(tipo='data')

# Deletes old data
conn_insumos.execute("DELETE FROM metadata_lineas;")
conn_insumos.execute("DELETE FROM metadata_ramales;")
conn_insumos.commit()

configs = leer_configs_generales()

try:
tabla_lineas = configs["nombre_archivo_informacion_lineas"]
branches_present = configs["lineas_contienen_ramales"]
except KeyError:
tabla_lineas = None
branches_present = False
print("No hay tabla con informacion configs")

if tabla_lineas is not None:
print('Leyendo tabla con informacion de lineas')
ruta = os.path.join("data", "data_ciudad", tabla_lineas)
info = pd.read_csv(ruta)

# Check all columns are present
if branches_present:
cols = ['id_linea', 'nombre_linea',
'id_ramal', 'nombre_ramal', 'modo']
else:
cols = ['id_linea', 'nombre_linea', 'modo']

assert pd.Series(cols).isin(info.columns).all()

# Check modes match the config's standardized modes
try:
modos_homologados = configs["modos"]
zipped = zip(modos_homologados.values(),
modos_homologados.keys())
modos_homologados = {k: v for k, v in zipped}

assert pd.Series(info.modo.unique()).isin(
modos_homologados.keys()).all()

info['modo'] = info['modo'].replace(modos_homologados)

except KeyError:
pass
# Check modes match the config's standardized modes
try:
modos_homologados = configs["modos"]
zipped = zip(modos_homologados.values(),
modos_homologados.keys())
modos_homologados = {k: v for k, v in zipped}

except KeyError:
pass

# the lineas info is mandatory

print('Leyendo tabla con informacion de lineas')
ruta = os.path.join("data", "data_ciudad", tabla_lineas)
info = pd.read_csv(ruta)

# Check no missing data in line or branches
# Check all columns are present
if branches_present:
cols = ['id_linea', 'nombre_linea',
'id_ramal', 'nombre_ramal', 'modo']
else:
cols = ['id_linea', 'nombre_linea', 'modo']

assert pd.Series(cols).isin(info.columns).all(), f"La tabla {ruta} debe tener los campos: {cols}"

if 'id_linea_agg' not in info.columns:
info['id_linea_agg'] = info['id_linea']
info['nombre_linea_agg'] = info['nombre_linea']
cols += ['id_linea_agg', 'nombre_linea_agg']

cols += [i for i in info.columns if i not in cols]
cols = [i for i in info.columns if i in ["id_linea",
"nombre_linea",
"id_linea_agg",
"nombre_linea_agg",
"modo",
"empresa",
"descripcion" ]]

assert pd.Series(info.modo.unique()).isin(
modos_homologados.keys()).all()

info['modo'] = info['modo'].replace(modos_homologados)

assert not info.id_linea.isna().any()
# assert info.dtypes['id_linea'] == int  # this raised an error here; forced the conversion to string so it can proceed
info['id_linea'] = info['id_linea'].astype(str)
info.loc[info.id_linea_agg.isna(), 'nombre_linea_agg'] = info.loc[info.id_linea_agg.isna(), 'nombre_linea']
info.loc[info.id_linea_agg.isna(), 'id_linea_agg'] = info.loc[info.id_linea_agg.isna(), 'id_linea']

lineas_cols = ['id_linea', 'nombre_linea',
'modo', 'empresa', 'descripcion']
info = info.reindex(columns=cols)

info.to_sql(
"metadata_lineas", conn_insumos, if_exists="replace",
index=False)

info_lineas = info.reindex(columns=lineas_cols)
if branches_present:
info_lineas = info.drop_duplicates(subset='id_linea')

if branches_present:
info_lineas = info_lineas.drop_duplicates(subset='id_linea')
ramales_cols = ['id_ramal', 'id_linea',
'nombre_ramal', 'modo']

ramales_cols = ['id_ramal', 'id_linea',
'nombre_ramal', 'modo', 'empresa',
'descripcion']
ramales_cols += [i for i in info_lineas.columns if i not in ramales_cols]
ramales_cols = [i for i in info_lineas.columns if i in ["id_ramal",
"id_linea",
"nombre_ramal",
"modo",
"empresa",
"descripcion"]]

info_ramales = info.reindex(columns=ramales_cols)

# Checks for missing and duplicated
assert not info_ramales.id_ramal.isna().any()
assert not info_ramales.id_ramal.duplicated().any()
# assert info_ramales.dtypes['id_ramal'] == int
info['id_ramal'] = info['id_ramal'].astype(str)
info_ramales = info.reindex(columns=ramales_cols)

info_ramales.to_sql(
"metadata_ramales", conn_insumos, if_exists="replace",
index=False)
# Checks for missing and duplicated
assert not info_ramales.id_ramal.isna().any(), "Existen nulos en el campo id_ramal"
assert not info_ramales.id_ramal.duplicated().any(), "Existen duplicados en id_ramal"

info_lineas.to_sql(
"metadata_lineas", conn_insumos, if_exists="replace",
index=False)
info_ramales.to_sql(
"metadata_ramales", conn_insumos, if_exists="replace",
index=False)

except KeyError:
print("No hay tabla con informacion configs")
conn_insumos.close()


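process_routes_metadata now backfills the aggregation columns when the lines CSV does not provide them: a missing id_linea_agg/nombre_linea_agg falls back to the line's own id and name. A sketch of that fallback, mirroring the .loc assignments in the diff (toy data):

import pandas as pd

info = pd.DataFrame({
    "id_linea": ["10", "11", "12"],
    "nombre_linea": ["A", "B", "C"],
    "modo": ["colectivo"] * 3,
    "id_linea_agg": [pd.NA, "10", "12"],
    "nombre_linea_agg": [pd.NA, "A-B", "C"],
})

# If the CSV carries no aggregation columns, every line is its own group.
if "id_linea_agg" not in info.columns:
    info["id_linea_agg"] = info["id_linea"]
    info["nombre_linea_agg"] = info["nombre_linea"]

# Row-level fallback for partially filled columns.
mask = info.id_linea_agg.isna()
info.loc[mask, "nombre_linea_agg"] = info.loc[mask, "nombre_linea"]
info.loc[mask, "id_linea_agg"] = info.loc[mask, "id_linea"]
print(info)  # line 10 ends up as its own group; line 11 joins group 10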
File renamed without changes.
19 changes: 19 additions & 0 deletions urbantrips/datamodel/transactions.py
@@ -83,6 +83,7 @@ def create_transactions(geolocalizar_trx_config,
)
print(trx.shape)


trx, tmp_trx_inicial = agrego_factor_expansion(trx, conn)

# Store the days being analyzed in the current run
@@ -108,6 +109,14 @@ def create_transactions(geolocalizar_trx_config,
1,
var_fex='factor_expansion')

agrego_indicador(
trx,
'Registros válidos en transacciones',
'transacciones',
1,
var_fex='')


# check there are no missing values in id
if trx["id"].isna().any():
warnings.warn("Hay faltantes en el id que identifica a las trx")
@@ -203,6 +212,8 @@ def create_transactions(geolocalizar_trx_config,
print('Borrar informacion de tarjetas con transacciones no validas')
trx = trx.loc[trx.id_tarjeta.isin(tmp_trx_limpio.id_tarjeta), :]



agrego_indicador(
trx,
'Cantidad de transacciones limpias',
@@ -492,6 +503,14 @@ def geolocalizar_trx(
print('Levanta archivo de transacciones', ruta_trx_eco)
trx_eco = pd.read_csv(ruta_trx_eco, dtype={id_tarjeta_trx: 'str'})

agrego_indicador(
trx_eco,
'Registros en transacciones',
'transacciones',
1,
var_fex='')


print("Filtrando transacciones invalidas:", tipo_trx_invalidas)
# Filter invalid transactions
if tipo_trx_invalidas is not None:
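The new calls record row counts at several points of the pipeline. agrego_indicador's real implementation lives elsewhere in urbantrips; the stub below only mirrors the signature visible at these call sites and is a hedged stand-in, not the project's code:

import pandas as pd

def agrego_indicador(df, detalle, tabla, nivel, var_fex=''):
    # Toy stand-in: log a row-count indicator, optionally weighted by an
    # expansion-factor column named in var_fex.
    total = df[var_fex].sum() if var_fex else len(df)
    print(f"[{tabla}] nivel={nivel} {detalle}: {total}")

trx = pd.DataFrame({"id": [1, 2], "factor_expansion": [1.5, 2.0]})
agrego_indicador(trx, 'Registros en transacciones', 'transacciones', 1,
                 var_fex='')  # unweighted count, as in the new calls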
45 changes: 33 additions & 12 deletions urbantrips/destinations/destinations.py
@@ -36,6 +36,7 @@ def infer_destinations():
print(mensaje)

conn_data = iniciar_conexion_db(tipo='data')
conn_insumos = iniciar_conexion_db(tipo='insumos')

dias_ultima_corrida = pd.read_sql_query(
"""
@@ -55,6 +56,19 @@

etapas = pd.read_sql_query(q, conn_data)

metadata_lineas = pd.read_sql_query(
"""
SELECT *
FROM metadata_lineas
""",
conn_insumos,
)

etapas = etapas.merge(metadata_lineas[['id_linea',
'id_linea_agg']],
how='left',
on='id_linea')

if 'od_validado' in etapas.columns:
etapas = etapas.drop(['od_validado'], axis=1)
if 'h3_d' in etapas.columns:
@@ -93,9 +107,16 @@ def infer_destinations():
conn_data.execute(query)
conn_data.commit()

etapas.to_sql("etapas", conn_data, if_exists="append", index=False)
etapas = etapas.drop(['id_linea_agg'],
axis=1)

etapas.to_sql("etapas",
conn_data,
if_exists="append",
index=False)

conn_data.close()
conn_insumos.close()

return None

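Note that id_linea_agg is only transient here: it is merged in so validation can happen at the aggregated level, then dropped again before persisting, leaving the etapas table schema unchanged. A self-contained sketch of that round trip (toy frames, in-memory db):

import sqlite3
import pandas as pd

conn_data = sqlite3.connect(":memory:")
etapas = pd.DataFrame({"id": [1], "id_linea": [10], "h3_o": ["a"]})
metadata_lineas = pd.DataFrame({"id_linea": [10], "id_linea_agg": [10]})

etapas = etapas.merge(metadata_lineas, how="left", on="id_linea")
# ... destinations are inferred and validated per id_linea_agg here ...
etapas = etapas.drop(["id_linea_agg"], axis=1)  # restore original schema
etapas.to_sql("etapas", conn_data, if_exists="append", index=False)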
@@ -140,14 +161,14 @@ def imputar_destino_min_distancia(etapas):

# create a df with each leg's id, the line it used and the
# following leg
lag_etapas = etapas.copy().reindex(columns=['id', 'id_linea', 'h3_d'])\
lag_etapas = etapas.copy().reindex(columns=['id', 'id_linea_agg', 'h3_d'])\
.rename(columns={'h3_d': 'lag_etapa'})
del etapas

# Get the candidate stops that share the same line
# and fall within the catchment area
lag_etapas_no_dups = lag_etapas.reindex(
columns=['id_linea', 'lag_etapa']).drop_duplicates()
columns=['id_linea_agg', 'lag_etapa']).drop_duplicates()
lag_etapas_no_dups['id'] = range(len(lag_etapas_no_dups))

paradas_candidatas = pd.DataFrame()
Expand Down Expand Up @@ -175,7 +196,7 @@ def imputar_destino_min_distancia(etapas):
paradas_candidatas_sample])
paradas_candidatas = paradas_candidatas.drop(['id'], axis=1)
paradas_candidatas = lag_etapas.merge(
paradas_candidatas, on=['id_linea', 'lag_etapa'], how='left')
paradas_candidatas, on=['id_linea_agg', 'lag_etapa'], how='left')

# Print distance statistics
print("Promedios de distancia entre la etapa siguiente y ")
@@ -215,8 +236,8 @@ def minimizar_distancia_parada_candidata(
the stop that minimizes the distance to all possible stops from that line
"""
paradas_candidatas_sample = paradas_candidatas\
.merge(matriz_validacion, left_on=['id_linea', 'lag_etapa'],
right_on=['id_linea', 'area_influencia'])\
.merge(matriz_validacion, left_on=['id_linea_agg', 'lag_etapa'],
right_on=['id_linea_agg', 'area_influencia'])\
.rename(columns={'parada': 'h3_d'})\
.drop(['area_influencia',
], axis=1)
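The candidate-stop step can be illustrated end to end. A sketch under the assumption (visible in the diff) that matriz_validacion maps each aggregated line's stops to the h3 cells of their catchment area; the cells are taken from the repo's test data, and h3.h3_distance is the h3-py v3 call (v4 renamed it grid_distance):

import h3
import pandas as pd

lag_etapas = pd.DataFrame({
    "id": [1],
    "id_linea_agg": [1],
    "lag_etapa": ["88c2e314a3fffff"],  # h3 cell where the next leg starts
})
matriz_validacion = pd.DataFrame({
    "id_linea_agg": [1, 1],
    "parada": ["88c2e314b1fffff", "88c2e31491fffff"],
    "area_influencia": ["88c2e314a3fffff", "88c2e314a3fffff"],
})

# Candidate stops: every stop of the same aggregated line whose catchment
# area contains the next leg's cell.
candidatas = lag_etapas.merge(
    matriz_validacion, left_on=["id_linea_agg", "lag_etapa"],
    right_on=["id_linea_agg", "area_influencia"])

# Keep, per leg id, the candidate stop closest to the next leg's cell.
candidatas["dist"] = candidatas.apply(
    lambda r: h3.h3_distance(r["lag_etapa"], r["parada"]), axis=1)
print(candidatas.sort_values("dist").drop_duplicates("id"))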
@@ -247,30 +268,30 @@ def validar_destinos(destinos):
conn_insumos = iniciar_conexion_db(tipo='insumos')

matriz_validacion = pd.read_sql_query(
"""SELECT distinct id_linea, area_influencia from matriz_validacion""",
"""SELECT distinct id_linea_agg, area_influencia from matriz_validacion""",
conn_insumos,
)
# Create unique OD pairs per line
pares_od_linea = destinos.reindex(
columns=["h3_o", "h3_d", "id_linea"]
columns=["h3_o", "h3_d", "id_linea_agg"]
).drop_duplicates()

# validate those OD pairs against the h3 rings
pares_od_linea = pares_od_linea.merge(
matriz_validacion,
how="left",
left_on=["id_linea", "h3_d"],
right_on=["id_linea", "area_influencia"],
left_on=["id_linea_agg", "h3_d"],
right_on=["id_linea_agg", "area_influencia"],
)
pares_od_linea["od_validado"] = pares_od_linea['area_influencia'].notna(
).fillna(0)

# Go from OD pairs back to each leg
pares_od_linea = pares_od_linea.reindex(
columns=["h3_o", "h3_d", "id_linea", "od_validado"]
columns=["h3_o", "h3_d", "id_linea_agg", "od_validado"]
)
destinos = destinos.merge(
pares_od_linea, how="left", on=["h3_o", "h3_d", "id_linea"]
pares_od_linea, how="left", on=["h3_o", "h3_d", "id_linea_agg"]
)
# Select columns and cast to int
destinos = destinos.reindex(columns=["id", "h3_d", "od_validado"])
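A toy demo of the validation join above: an OD pair validates when the destination cell falls inside the catchment of any stop of the same aggregated line (hypothetical cells, same column names as the diff):

import pandas as pd

matriz_validacion = pd.DataFrame({
    "id_linea_agg": [1],
    "area_influencia": ["88c2e314a3fffff"],
})
destinos = pd.DataFrame({
    "id": [10, 11],
    "h3_o": ["88c2e314b1fffff"] * 2,
    "h3_d": ["88c2e314a3fffff", "88c2e31495fffff"],
    "id_linea_agg": [1, 1],
})

pares = destinos.merge(
    matriz_validacion, how="left",
    left_on=["id_linea_agg", "h3_d"],
    right_on=["id_linea_agg", "area_influencia"])
# The first pair validates (its cell is in the catchment); the second does not.
pares["od_validado"] = pares["area_influencia"].notna().astype(int)
print(pares[["id", "h3_d", "od_validado"]])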
2 changes: 1 addition & 1 deletion urbantrips/tests/data/matriz_validacion_amba_test.csv
@@ -1,4 +1,4 @@
id_linea,parada,area_influencia
id_linea_agg,parada,area_influencia
1,88c2e314b1fffff,88c2e314a3fffff
1,88c2e314b1fffff,88c2e31491fffff
1,88c2e314b1fffff,88c2e31495fffff
8 changes: 5 additions & 3 deletions urbantrips/utils/utils.py
@@ -287,7 +287,7 @@ def create_db():
"""
CREATE TABLE IF NOT EXISTS matriz_validacion
(
id_linea int,
id_linea_agg int,
parada text,
area_influencia text
)
@@ -310,9 +310,11 @@ def create_db():
conn_insumos.execute(
"""
CREATE TABLE IF NOT EXISTS metadata_lineas
(id_linea INT PRIMARY KEY NOT NULL,
(id_linea INT PRIMARY KEY NOT NULL,
nombre_linea text not null,
modo text not null,
id_linea_agg INT,
nombre_linea_agg text,
modo text,
empresa text,
descripcion text
)
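A quick way to sanity-check the migrated schema after a run; the db path is hypothetical (urbantrips resolves the real one from its config):

import sqlite3

conn = sqlite3.connect("data/db/insumos.sqlite")  # hypothetical path
cols = [row[1] for row in conn.execute("PRAGMA table_info(metadata_lineas)")]
# The commit adds these two columns alongside id_linea/nombre_linea/modo.
assert "id_linea_agg" in cols and "nombre_linea_agg" in cols, cols
conn.close()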