Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

parallel data conversion: shortcut computation #190

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 53 additions & 23 deletions scripts/file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,23 @@
import functools
import itertools
import json
import multiprocessing
import random
import warnings
from dataclasses import dataclass
from os.path import abspath, join
from pathlib import Path
from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union
from typing import (
Collection,
Dict,
List,
NamedTuple,
Optional,
Set,
Sized,
Tuple,
Union,
)

import h3.api.numpy_int as h3
import numpy as np
Expand Down Expand Up @@ -118,7 +131,7 @@
int2coord,
)

ShortcutMapping = Dict[int, List[int]]
N_PROCESSES = None # os.cpu_count()

nr_of_polygons = -1
nr_of_zones = -1
Expand All @@ -134,6 +147,8 @@
list_of_pointers = []
poly_nr2zone_id = []

ShortcutMapping = Dict[int, List[int]]


def _holes_in_poly(poly_nr):
for i, nr in enumerate(polynrs_of_holes):
Expand Down Expand Up @@ -180,7 +195,9 @@ def parse_polygons_from_json(input_path: Path) -> int:
# everything else is interpreted as a hole!
for hole_nr, hole in enumerate(poly_with_hole):
nr_of_holes += 1 # keep track of how many holes there are
print(f"\rpolygon {poly_id}, zone {tz_name}, hole number {nr_of_holes}, {hole_nr+1} in polygon", end="")
print(
f"\rpolygon {poly_id}, zone {tz_name}, hole number {nr_of_holes}, {hole_nr + 1} in polygon", end=""
)
polynrs_of_holes.append(poly_id)
hole_poly = to_numpy_polygon(hole)
holes.append(hole_poly)
Expand Down Expand Up @@ -508,7 +525,7 @@ def get_hex(hex_id: int) -> Hex:
return Hex.from_id(hex_id)


def optimise_shortcut_ordering(poly_ids: List[int]) -> List[int]:
def optimise_shortcut_ordering(poly_ids: Collection[int]) -> List[int]:
"""optimises the order of polygon ids for faster timezone checks

observation: as soon as just polygons of one zone are left, this zone can be returned
Expand All @@ -519,7 +536,7 @@ def optimise_shortcut_ordering(poly_ids: List[int]) -> List[int]:
-> sort the list of polygon ids in each shortcut after the size of the corresponding polygons
"""
if len(poly_ids) <= 1:
return poly_ids
return list(poly_ids)
global polygon_lengths

poly_sizes = [polygon_lengths[i] for i in poly_ids]
Expand All @@ -538,29 +555,41 @@ def optimise_shortcut_ordering(poly_ids: List[int]) -> List[int]:
return poly_ids_sorted


def get_shortcut_entry(hex_id: int) -> List[int]:
# print(f"compiling shortcut for {hex_id=}")
cell = get_hex(hex_id)
poly_ids = optimise_shortcut_ordering(cell.polys_in_cell)
return poly_ids


def compile_h3_map(candidates: Set) -> ShortcutMapping:
"""
operate on one hex resolution
also store results separately to divide the output data files
"""
mapping: ShortcutMapping = {}
total_candidates = len(candidates)

def report_progress():
nr_candidates = len(candidates)
processed = total_candidates - nr_candidates
print(
f"\r{processed:,} processed\t{nr_candidates:,} remaining\t",
end="",
)

while candidates:
hex_id = candidates.pop()
cell = get_hex(hex_id)
polys = list(cell.polys_in_cell)
mapping[hex_id] = optimise_shortcut_ordering(polys)
report_progress()

candidates = list(candidates)
# if DEBUG:
# candidates = random.sample(candidates, 200)

# total_candidates = len(candidates)
# def report_progress(nr_processed: int):
# nr_candidates = total_candidates - nr_processed
# print(
# f"\r{nr_processed:,} processed\t{nr_candidates:,} remaining\t",
# end="",
# )

print("spawning multiple worker processes...")
with multiprocessing.Pool(processes=N_PROCESSES) as pool:
print("compiling shortcut mapping...")
shortcut_entries = pool.map(get_shortcut_entry, candidates)

lengths = map(len, shortcut_entries)
# FIXME: empty mappings!
max_lenght = max(lengths)
# combine into mapping dictionary: hex_id -> polygon ids
mapping = dict(zip(candidates, shortcut_entries))
return mapping


Expand All @@ -581,6 +610,7 @@ def compile_shortcut_mapping(output_path: Path) -> int:

cf. https://eng.uber.com/h3/
"""

print("\n\ncomputing timezone polygon index ('shortcuts')...")
candidates = all_res_candidates(SHORTCUT_H3_RES)
print(
Expand Down Expand Up @@ -618,7 +648,7 @@ def validate_shortcut_completeness(mapping: ShortcutMapping):
f"(hexagon cell id {hex_id} missing in mapping)"
)
if poly_id not in shortcut_entries:
print(
warnings.warn(
f"ERR: point #{i} ({lng}, {lat}) of polygon {poly_id} "
f"does not appear in shortcut entries {shortcut_entries} of cell {hex_id}"
)
Expand Down