diff --git a/abm/src/main/python/dataExporter/abmScenario.py b/abm/src/main/python/dataExporter/abmScenario.py
index d5874a8..c598136 100644
--- a/abm/src/main/python/dataExporter/abmScenario.py
+++ b/abm/src/main/python/dataExporter/abmScenario.py
@@ -284,6 +284,14 @@ def properties(self) -> dict:
            "year": {
                "line": "scenarioYear=",
                "type": "int",
+               "value": None},
+           "rsmSamplingRate": {
+               "line": "rsm.default.sampling.rate=",
+               "type": "float",
+               "value": None},
+           "useDifferentialSampling": {
+               "line": "use.differential.sampling=",
+               "type": "int",
                "value": None}
        }
 
@@ -637,7 +645,7 @@ def mgra_input(self) -> pd.DataFrame:
             "adultschenrl",
             "ech_dist",
             "hch_dist",
-            "pseudomsa",
+            #"pseudomsa",
             "parkarea",
             "hstallsoth",
             "hstallssam",
@@ -649,7 +657,7 @@ def mgra_input(self) -> pd.DataFrame:
             "mstallsoth",
             "mstallssam",
             "mparkcost",
-            "zip09",
+            #"zip09",
             "parkactive",
             "openspaceparkpreserve",
             "beachactive",
@@ -657,9 +665,9 @@ def mgra_input(self) -> pd.DataFrame:
             "truckregiontype",
             "district27",
             "milestocoast",
-            "acres",
-            "effective_acres",
-            "land_acres",
+            #"acres",
+            #"effective_acres",
+            #"land_acres",
             "MicroAccessTime",
             "remoteAVParking",
             "refueling_stations",
@@ -2756,8 +2764,32 @@ def ie(self) -> pd.DataFrame:
             dtype={"hh_id": "int32",
                    "transponder": "bool"})
 
+        input_hh = pd.read_csv(
+            os.path.join(self.scenario_path, "input", "households.csv"),
+            usecols=["hhid",
+                     "taz"],
+            dtype={"hhid": "int64",
+                   "taz": "int64"})
+
+        rsm_zones = pd.read_csv(
+            os.path.join(self.scenario_path, "input", "taz_crosswalk.csv"))
+
+        dict_clusters = dict(zip(rsm_zones["taz"], rsm_zones["cluster_id"]))
+
+        input_hh["taz"] = input_hh["taz"].map(dict_clusters)
+        input_hh["scale_factor"] = 1 / self.properties["rsmSamplingRate"]
+
+        study_area_file = os.path.join(self.scenario_path, "input", "study_area.csv")
+
+        if self.properties["useDifferentialSampling"] and os.path.exists(study_area_file):
+            df = pd.read_csv(study_area_file)
+            study_area_taz = set(df["taz"])
+            rsm_zone = set(rsm_zones.loc[rsm_zones["taz"].isin(study_area_taz), "cluster_id"])
+            input_hh.loc[input_hh["taz"].isin(rsm_zone), "scale_factor"] = 1
+
         # if household has a transponder then all trips can use it
         trips = trips.merge(hh, left_on="hhID", right_on="hh_id")
+        trips = trips.merge(input_hh, left_on="hhID", right_on="hhid", how="left")
 
         # apply exhaustive field mappings where applicable
         mappings = {
@@ -2832,10 +2864,12 @@ def ie(self) -> pd.DataFrame:
                    1 / self.properties["nonPooledTNCPassengers"],
                    1 / self.properties["pooledTNCPassengers"]]
 
+        sampling_rate = self.properties["rsmSamplingRate"]
         trips["weightTrip"] = pd.Series(
-            np.select(conditions, choices, default=1) / self.properties["sampleRate"],
+            np.select(conditions, choices, default=1),  # / (sampling_rate * sampling_rate)
            dtype="float32")
-        trips["weightPersonTrip"] = 1 / self.properties["sampleRate"]
+        trips["weightTrip"] = trips["weightTrip"] * trips["scale_factor"]
+        trips["weightPersonTrip"] = trips["scale_factor"]  # 1 / (sampling_rate * sampling_rate)
         trips["weightPersonTrip"] = trips["weightPersonTrip"].astype("float32")
 
         # rename columns to standard/generic ABM naming conventions
diff --git a/abm/src/main/resources/sandag_abm.properties b/abm/src/main/resources/sandag_abm.properties
index 3e9c074..6eb7414 100644
--- a/abm/src/main/resources/sandag_abm.properties
+++ b/abm/src/main/resources/sandag_abm.properties
@@ -1344,6 +1344,7 @@
 rsm.zones = 2000
 external.zones = 12
 run.rsm.sampling = 0
 rsm.default.sampling.rate = 0.25
+use.differential.sampling = 1
 rsm.min.sampling.rate = 0.25
 run.rsm.assembler = 0
 rsm.centroid.connector.start.id = 55000
diff --git a/rsm/assembler.py b/rsm/assembler.py
index a55c643..9227652 100644
--- a/rsm/assembler.py
+++ b/rsm/assembler.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 import numpy as np
 import pandas as pd
+from rsm.utility import *
 
 logger = logging.getLogger(__name__)
 
@@ -30,8 +31,10 @@ def rsm_assemble(
     rsm_joint,
     households,
     mgra_crosswalk=None,
+    taz_crosswalk=None,
     sample_rate=0.25,
-    run_assembler=1
+    study_area_taz=None,
+    run_assembler=1,
 ):
     """
     Assemble and evaluate RSM trip making.
@@ -64,6 +67,11 @@
        Flag to indicate whether to run RSM assembler or not.
        1 is to run assembler, 0 is to turn if off
        setting this to 0 is only an option if sampler is turned off
+    sample_rate : float
+        Default/fixed sample rate used when the sampler is turned off;
+        it is used to scale up the trips if `run_assembler` is 0.
+    study_area_taz : list
+        List of RSM zones in the study area.
 
     Returns
     -------
@@ -94,12 +102,18 @@
     mgra_crosswalk = Path(mgra_crosswalk).expanduser()
     assert os.path.isfile(mgra_crosswalk)
 
+    if taz_crosswalk is not None:
+        taz_crosswalk = Path(taz_crosswalk).expanduser()
+        assert os.path.isfile(taz_crosswalk)
+
     # load trip data - partial simulation of RSM model
     logger.info("reading ind_trips_rsm")
     ind_trips_rsm = pd.read_csv(rsm_indiv)
     logger.info("reading jnt_trips_rsm")
     jnt_trips_rsm = pd.read_csv(rsm_joint)
 
+    scale_factor = int(1.0 / sample_rate)
+
     if run_assembler == 1:
         # load trip data - full simulation of residual/source model
         logger.info("reading ind_trips_full")
@@ -209,20 +223,28 @@ def _agg_by_hhid_and_tripmode(df, name):
         # then scale the trips in the trip list using the fixed sample rate
         # trips in the final trip lists will be 100%
         scale_factor = int(1.0/sample_rate)
-
+
+        if study_area_taz:
+            sa_rsm = study_area_taz
+        else:
+            sa_rsm = None
+
         # concat is slow
         # https://stackoverflow.com/questions/50788508/how-can-i-replicate-rows-of-a-pandas-dataframe
         #final_ind_trips = pd.concat([ind_trips_rsm]*scale_factor, ignore_index=True)
         #final_jnt_trips = pd.concat([jnt_trips_rsm]*scale_factor, ignore_index=True)
-
-        final_ind_trips = pd.DataFrame(
-            np.repeat(ind_trips_rsm.values, scale_factor, axis=0),
-            columns=ind_trips_rsm.columns
-        )
-
-        final_jnt_trips = pd.DataFrame(
-            np.repeat(jnt_trips_rsm.values, scale_factor, axis=0),
-            columns=jnt_trips_rsm.columns
-        )
-
+
+        final_ind_trips = scaleup_to_rsm_samplingrate(ind_trips_rsm,
+                                                      households,
+                                                      taz_crosswalk,
+                                                      scale_factor,
+                                                      study_area_tazs=sa_rsm)
+
+        final_jnt_trips = scaleup_to_rsm_samplingrate(jnt_trips_rsm,
+                                                      households,
+                                                      taz_crosswalk,
+                                                      scale_factor,
+                                                      study_area_tazs=sa_rsm)
+
     return final_ind_trips, final_jnt_trips
diff --git a/rsm/sampler.py b/rsm/sampler.py
index 936d087..5baa57c 100644
--- a/rsm/sampler.py
+++ b/rsm/sampler.py
@@ -47,7 +47,8 @@ def rsm_household_sampler(
        Accessibility in the latest run is given (preloaded) or read in from here.
        Give as a relative path (from `input_dir`) or an absolute path.
    study_area : study_area (array-like)
-       Array of RSM zone (these are numbered 1 to N in the RSM) in the study area. These zones are sampled at 100%.
+       Array of RSM zones (these are numbered 1 to N in the RSM) in the study area.
+       These zones are sampled at 100% if differential sampling is also turned on.
    input_household : input_household (Path-like or pandas.DataFrame)
        Complete synthetic household file. This data will be filtered to match the
        sampling of households and written out to a new CSV file.
@@ -140,7 +141,7 @@ def _resolve_out_filename(x):
     mgra_hh["sampling_rate"] = default_sampling_rate
 
     if study_area is not None:
-        mgra_hh.loc[mgra_hh.index.isin(study_area), "sample_rate"] = 1
+        mgra_hh.loc[mgra_hh.index.isin(study_area), "sampling_rate"] = 1
 
     sample_households = []
 
diff --git a/rsm/utility.py b/rsm/utility.py
index 44ae176..f94b256 100644
--- a/rsm/utility.py
+++ b/rsm/utility.py
@@ -230,3 +230,78 @@ def _density_function(mgra_in):
     mgra_data = mgra_data.fillna(0)
 
     return mgra_data
+
+
+def scaleup_to_rsm_samplingrate(df,
+                                household,
+                                taz_crosswalk,
+                                scale_factor,
+                                study_area_tazs=None):
+    """
+    Scales up the trips to 100% by repeating each trip record according to
+    its household's scale factor (1 for study area zones, otherwise the
+    inverse of the sampling rate).
+    """
+
+    hh = pd.read_csv(household)
+    hh = hh[['hhid', 'taz']]
+
+    # map original TAZs to aggregated RSM zones (cluster ids)
+    rsm_zones = pd.read_csv(taz_crosswalk)
+    dict_clusters = dict(zip(rsm_zones["taz"], rsm_zones["cluster_id"]))
+
+    hh["taz"] = hh["taz"].map(dict_clusters)
+    hh['scale_factor'] = scale_factor
+
+    if study_area_tazs:
+        hh.loc[hh['taz'].isin(study_area_tazs), 'scale_factor'] = 1
+
+    df = pd.merge(df, hh, left_on='hh_id', right_on='hhid', how='left')
+    final_df = df.loc[np.repeat(df.index, df['scale_factor'])]
+    final_df = final_df.drop(columns=['hhid', 'scale_factor', 'taz'])
+
+    return final_df
+
+
+def check_column_names(df, columns):
+    """
+    Checks the column names of the study area file.
+    """
+    df_columns = df.columns.tolist()
+    if set(columns) != set(df_columns):
+        raise ValueError("Column names do not match the expected column names: 'taz' and 'group'. Please fix the column names.")
+    return True
+
+
+def create_list_study_area_taz(study_area_file):
+    """
+    Creates a list of ints or lists of ints based on the values of the 'group' column.
+    """
+
+    try:
+        df = pd.read_csv(study_area_file)
+        columns_to_check = ['taz', 'group']
+        check_column_names(df, columns_to_check)
+
+    except ValueError as e:
+        print("Error:", str(e))
+        logger.info("Error: %s", str(e))
+        return None
+
+    grouped_taz = df.groupby('group')['taz'].apply(list).values.tolist()
+
+    return grouped_taz
+
+
+def find_rsm_zone_of_study_area(study_area_file, taz_crosswalk):
+    """
+    Finds the RSM zones for the study area using the TAZ crosswalk.
+    """
+
+    try:
+        df = pd.read_csv(study_area_file)
+        taz_cwk = pd.read_csv(taz_crosswalk)
+        study_area_taz = set(df['taz'])
+        rsm_zone = set(taz_cwk.loc[taz_cwk['taz'].isin(study_area_taz), 'cluster_id'])
+
+    except Exception as e:
+        logger.info("Error in identifying RSM zone for study area: %s", str(e))
+        return None
+
+    return list(rsm_zone)
\ No newline at end of file
diff --git a/rsm/zone_agg.py b/rsm/zone_agg.py
index 30eef32..b45fd63 100644
--- a/rsm/zone_agg.py
+++ b/rsm/zone_agg.py
@@ -229,8 +229,7 @@ def aggregate_zones(
        or lists of integers (groups of MGRAs that should be aggregated exactly
        as given, with no less and no more)
    explicit_col : explicit_col (str)
-       The name of the column containing the ID's from `explicit_agg`, usually
-       'mgra' or 'taz'
+       The name of the column containing the ID's from `explicit_agg`, usually 'taz'
    agg_instruction : agg_instruction (dict)
        Dictionary passed to pandas `agg` that says how to aggregate data columns.
    start_cluster_ids : start_cluster_ids (int, default 13)
diff --git a/scripts/rsm_assembler.py b/scripts/rsm_assembler.py
index 85d6481..4ce31a7 100644
--- a/scripts/rsm_assembler.py
+++ b/scripts/rsm_assembler.py
@@ -32,8 +32,11 @@
 ORG_JOINT_TRIPS = os.path.join(org_model_dir, "output", "jointTripData_3.csv")
 RSM_INDIV_TRIPS = os.path.join(rsm_dir, "output", "indivTripData_" + str(iteration) + ".csv")
 RSM_JOINT_TRIPS = os.path.join(rsm_dir, "output", "jointTripData_" + str(iteration) + ".csv")
-HOUSEHOLDS = os.path.join(org_model_dir, "input", "households.csv")
+
+HOUSEHOLDS = os.path.join(rsm_dir, "input", "households.csv")
 MGRA_CROSSWALK = os.path.join(rsm_dir, "input", "mgra_crosswalk.csv")
+TAZ_CROSSWALK = os.path.join(rsm_dir, "input", "taz_crosswalk.csv")
+STUDY_AREA = os.path.join(rsm_dir, "input", "study_area.csv")
 
 #creating copy of individual and joint trips file
 shutil.copy(RSM_INDIV_TRIPS, os.path.join(rsm_dir, "output", "indivTripData_abm_"+ str(iteration) + ".csv"))
@@ -43,21 +46,39 @@
 ABM_PROPERTIES = os.path.join(ABM_PROPERTIES_FOLDER, "sandag_abm.properties")
 RUN_ASSEMBLER = int(get_property(ABM_PROPERTIES, "run.rsm.assembler"))
 SAMPLE_RATE = float(get_property(ABM_PROPERTIES, "rsm.default.sampling.rate"))
+USE_DIFFERENTIAL_SAMPLING = int(get_property(ABM_PROPERTIES, "use.differential.sampling"))
+
+if USE_DIFFERENTIAL_SAMPLING and os.path.exists(STUDY_AREA):
+    logging.info(f"Study Area file: {STUDY_AREA}")
+    study_area_taz = find_rsm_zone_of_study_area(STUDY_AREA, TAZ_CROSSWALK)
+    if study_area_taz is not None:
+        # use the RSM zones found for the study area
+        SA_TAZ = study_area_taz
+        logging.info(f"RSM zones identified for the study area are: {SA_TAZ}")
+    else:
+        # fall back to default scaling if the file could not be processed
+        logging.info("Please check the study area file. A 'taz' column is expected in the file to find the corresponding RSM zones.")
+        SA_TAZ = None
+
+else:
+    SA_TAZ = None
 
 #RSM Assembler
-final_ind, final_jnt = rsm_assemble(
+final_ind_trips, final_jnt_trips = rsm_assemble(
     ORG_INDIV_TRIPS,
     ORG_JOINT_TRIPS,
     RSM_INDIV_TRIPS,
     RSM_JOINT_TRIPS,
     HOUSEHOLDS,
     MGRA_CROSSWALK,
+    TAZ_CROSSWALK,
     SAMPLE_RATE,
+    SA_TAZ,
     RUN_ASSEMBLER
 )
 
 #save as csv files
-final_ind.to_csv(os.path.join(rsm_dir, "output", "indivTripData_" + str(iteration) + ".csv"), index = False)
-final_jnt.to_csv(os.path.join(rsm_dir, "output", "jointTripData_" + str(iteration) + ".csv"), index = False)
+final_ind_trips.to_csv(os.path.join(rsm_dir, "output", "indivTripData_" + str(iteration) + ".csv"), index = False)
+final_jnt_trips.to_csv(os.path.join(rsm_dir, "output", "jointTripData_" + str(iteration) + ".csv"), index = False)
 
 logging.info("finished logging rsm_assembler")
\ No newline at end of file
diff --git a/scripts/rsm_input_aggregator.py b/scripts/rsm_input_aggregator.py
index b858226..43c93f7 100644
--- a/scripts/rsm_input_aggregator.py
+++ b/scripts/rsm_input_aggregator.py
@@ -87,7 +87,7 @@
     "TripMatrices.csv", "transponderModelAccessibilities.csv", "crossBorderTours.csv",
     "internalExternalTrips.csv", "visitorTours.csv", "visitorTrips.csv", "householdAVTrips.csv",
     "crossBorderTrips.csv", "TNCTrips.csv", "airport_out.SAN.csv", "airport_out.CBX.csv",
-    "TNCtrips.csv"]
+    "TNCtrips.csv"]  #, "households.csv"]
 )
 
 logging.info("finished logging rsm_input_aggregator")
\ No newline at end of file
diff --git a/scripts/rsm_sampler.py b/scripts/rsm_sampler.py
index 5d7229a..daa1787 100644
--- a/scripts/rsm_sampler.py
+++ b/scripts/rsm_sampler.py
@@ -31,6 +31,7 @@
 ABM_PROPERTIES_FOLDER = os.path.join(rsm_dir, "conf")
 ABM_PROPERTIES = os.path.join(ABM_PROPERTIES_FOLDER,
"sandag_abm.properties") INPUT_RSM_DIR = os.path.join(rsm_dir, "input") +EXPILICT_AGG_TAZ = os.path.join(rsm_dir, "input", "study_area.csv") # output files OUTPUT_RSM_DIR = os.path.join(rsm_dir, "output") @@ -46,6 +47,7 @@ sampling_rate = float(get_property(ABM_PROPERTIES, "rsm.default.sampling.rate")) min_sampling_rate = float(get_property(ABM_PROPERTIES, "rsm.min.sampling.rate")) baseline_run_dir = get_property(ABM_PROPERTIES, "rsm.baseline.run.dir") +use_differential_sampling = int(get_property(ABM_PROPERTIES, "use.differential.sampling")) if run_rsm_sampling == 1: CURR_ITER_ACCESS = os.path.join( @@ -69,11 +71,29 @@ logging.info(f"Current Iteration Accessibility File: {CURR_ITER_ACCESS}") logging.info(f"Previous Iteration Accessibility File: {PREV_ITER_ACCESS}") +if use_differential_sampling & os.path.exists(EXPILICT_AGG_TAZ): + logging.info(f"Study Area file: {EXPILICT_AGG_TAZ}") + study_area_taz = find_rsm_zone_of_study_area(EXPILICT_AGG_TAZ, OUTPUT_TAZ_CROSSWALK) + if study_area_taz is not None: + # Process the result + sa_taz = study_area_taz + logger.info(f"RSM Zone identified for the Study area are : {sa_taz}", ) + else: + # Handle the error + logger.info("Please check the study area file. 'taz' column is expected in the file to find the corresponding RSM zone.") + sa_taz = None + +else: + logger.info("All RSM zones will be sampled at the deafult sampling rate") + sa_taz = None + + rsm_household_sampler( input_dir=rsm_dir, output_dir=rsm_dir, prev_iter_access=PREV_ITER_ACCESS, curr_iter_access=CURR_ITER_ACCESS, + study_area=sa_taz, input_household=FULL_ABM_SYNTH_HOUSHOLDS, input_person=FULL_ABM_SYNTH_PERSONS, taz_crosswalk=OUTPUT_TAZ_CROSSWALK, diff --git a/scripts/rsm_zone_aggregator.py b/scripts/rsm_zone_aggregator.py index 3fe8f8d..8567584 100644 --- a/scripts/rsm_zone_aggregator.py +++ b/scripts/rsm_zone_aggregator.py @@ -47,7 +47,8 @@ FULL_ABM_TRIP_DIR = os.path.join(full_model_dir, "output") FULL_ABM_SYNTH_HOUSHOLDS = os.path.join(full_model_dir, "input", "households.csv") FULL_ABM_SYNTH_PERSONS = os.path.join(full_model_dir, "input", "persons.csv") -EXPLICIT_ZONE_AGG = [] +EXPILICT_AGG_TAZ = os.path.join(rsm_main_dir, "input", "study_area.csv") + #output files RSM_ABM_PROPERTIES = os.path.join(rsm_main_dir, "conf", "sandag_abm.properties") @@ -60,6 +61,16 @@ ) logging.info("start logging rsm_zone_aggregator") + +# prepare list of MGRA that should not be aggregated or grouped together +logging.info("Check if the study area file exists in the RSM input folder") +if os.path.exists(EXPILICT_AGG_TAZ): + EXPLICIT_ZONE_AGG = create_list_study_area_taz(EXPILICT_AGG_TAZ) + logging.info("The input folder has a study_area file. The TAZs will be aggregated based on the study area file") + logging.info(EXPLICIT_ZONE_AGG) +else: + EXPLICIT_ZONE_AGG = [] + # # Zone Aggregation #