Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve timing of checks depending on changes since last check #163

Open
wants to merge 85 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
85 commits
Select commit Hold shift + click to select a range
739ef30
docs: update changelog
bolinocroustibat Sep 10, 2024
87d2c52
feat: select outdated checks with increasing time
bolinocroustibat Sep 11, 2024
ed5a584
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Sep 12, 2024
68f5556
fix: fix wrong config var format
bolinocroustibat Sep 13, 2024
d72ffac
tests: fix test
bolinocroustibat Sep 18, 2024
76ab578
docs: fix docstring
bolinocroustibat Sep 18, 2024
c72a100
docs: add comments
bolinocroustibat Sep 18, 2024
b8c8cac
clean: remove useless imports
bolinocroustibat Sep 18, 2024
7d07a2b
fix: update the count of outdated checks in get_crawler_status
bolinocroustibat Sep 18, 2024
825e793
docs: update docstrings
bolinocroustibat Sep 18, 2024
463f229
refactor: rename some vars
bolinocroustibat Sep 18, 2024
ab00ea7
docs: update changelog
bolinocroustibat Sep 18, 2024
0e47263
docs: update docstrings
bolinocroustibat Sep 18, 2024
1825df6
tests: add tests (wip)
bolinocroustibat Sep 18, 2024
941e894
tests: add tests for re-check before/after default delay
bolinocroustibat Sep 19, 2024
2e0fb48
tests: better names and comments
bolinocroustibat Sep 19, 2024
f07bcee
tests: refactor latest tests as parametrized (test_re_check_depending…
bolinocroustibat Sep 19, 2024
5f0c84c
feat: manage large resources exceptions differently (#148)
bolinocroustibat Sep 23, 2024
e33f090
WIP
bolinocroustibat Sep 23, 2024
6adec5c
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Sep 23, 2024
c25281c
fix: fix increasing check delay logic
bolinocroustibat Sep 23, 2024
35ecc23
tests: fix tests
bolinocroustibat Sep 23, 2024
3491421
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Sep 24, 2024
f5eef08
style: lint code
bolinocroustibat Sep 26, 2024
852663a
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Sep 26, 2024
6e6ac05
feat: use CHECK_DELAY_DEFAULT
bolinocroustibat Sep 26, 2024
431aa7f
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Sep 26, 2024
571a12d
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Sep 26, 2024
ff1fc70
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Sep 28, 2024
3daf61b
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Sep 30, 2024
524e57c
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 2, 2024
70a2640
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 2, 2024
3e2e6cd
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 3, 2024
a8dc898
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 22, 2024
9a10b7c
docs: update changelog
bolinocroustibat Oct 22, 2024
fe9e68c
docs: update comment
bolinocroustibat Oct 22, 2024
b9e4099
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 24, 2024
2f59451
WIP
bolinocroustibat Oct 24, 2024
6d99e23
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 24, 2024
461cf9a
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 25, 2024
8c6372d
feat: add migration to add a next_check column to checks
bolinocroustibat Oct 25, 2024
5aa528b
feat: new logic for select_batch
bolinocroustibat Oct 25, 2024
8f3e121
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 29, 2024
96256a8
clean: remove useless imports
bolinocroustibat Oct 29, 2024
0d4275a
docs: add TODO
bolinocroustibat Oct 29, 2024
9661aa5
feat: add logic to calculate next_check datetime
bolinocroustibat Oct 29, 2024
79a0421
fix: better next_check behaviour
bolinocroustibat Oct 29, 2024
a7ae5dd
tests: revert test
bolinocroustibat Oct 29, 2024
04d324a
docs: add comment
bolinocroustibat Oct 29, 2024
9de1c48
fix: fix wrong var in code
bolinocroustibat Oct 30, 2024
23d5ec4
fix: fix type hint
bolinocroustibat Oct 30, 2024
80645f5
refactor: refactor crawler status logic
bolinocroustibat Oct 30, 2024
ae8ecaa
refactor: slight refactor of next_check logic
bolinocroustibat Oct 30, 2024
e809a52
clean: remove useless imports
bolinocroustibat Oct 30, 2024
3039850
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 30, 2024
a69f5bc
docs: update changelog
bolinocroustibat Oct 30, 2024
4c4201e
fix: don't use CHECK_DELAY_DEFAULT
bolinocroustibat Oct 30, 2024
df0f967
refactor: change next_check to next_check_at
bolinocroustibat Oct 30, 2024
e68ab04
docs: comment fix
bolinocroustibat Oct 30, 2024
5a95774
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Oct 30, 2024
b32a703
fix: fix migration
bolinocroustibat Nov 4, 2024
e67f2a0
fix: fix crawler status bug
bolinocroustibat Nov 4, 2024
bf5a455
clean: add types
bolinocroustibat Nov 4, 2024
6285a54
refactor: calculate next_check_at date also depending on resource mod…
bolinocroustibat Nov 6, 2024
5df8670
docs: update docstring
bolinocroustibat Nov 6, 2024
3a14cd1
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 6, 2024
9adbcff
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 6, 2024
eef5f21
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 6, 2024
7f0348f
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 6, 2024
0f1e2e1
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 7, 2024
913913d
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 8, 2024
a60bff9
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 15, 2024
85f0ef3
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 15, 2024
2c07642
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 15, 2024
4bcb89b
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 15, 2024
927bb09
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 15, 2024
5e651eb
Merge branch 'main' into unavailable-resources-management
bolinocroustibat Nov 20, 2024
6a9a7ea
fix: fix calculate_next_check type bug issue
bolinocroustibat Nov 22, 2024
abc45b6
fix tests
bolinocroustibat Nov 22, 2024
8e7ec6f
fix: fix nasty bug in preprocess_check_data
bolinocroustibat Nov 22, 2024
0e89128
fix: fix select resource when last check has no next_check planned
bolinocroustibat Nov 22, 2024
cc69099
fix: fix select batch
bolinocroustibat Nov 22, 2024
d1590cb
tests: add tests
bolinocroustibat Nov 22, 2024
d58d4af
tests: improve tests
bolinocroustibat Nov 22, 2024
9e0f626
feat: check get_all and get_latest methods return next_check_at
bolinocroustibat Nov 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
- Delete resource as a CRUD method [#161](https://github.com/datagouv/hydra/pull/161)
- Refactor routes URLs to be more RESTful and separate legacy routes code from new routes code [#132](https://github.com/datagouv/hydra/pull/132)
- Display app version and environment in health check endpoint [#164](https://github.com/datagouv/hydra/pull/164)
- Improve unavailable resources management [#163](https://github.com/datagouv/hydra/pull/163)

## 1.0.1 (2023-01-04)

Expand Down
11 changes: 8 additions & 3 deletions udata_hydra/config_default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ NO_BACKOFF_DOMAINS = [
BACKOFF_NB_REQ = 180
BACKOFF_PERIOD = 360 # in seconds
COOL_OFF_PERIOD = 86400 # 1 day to cool off when we've messed up

# ids of the resources that are too large to be processed normally
# but that we want to have anyway
LARGE_RESOURCES_EXCEPTIONS = [
Expand All @@ -42,13 +43,17 @@ LARGE_RESOURCES_EXCEPTIONS = [
"4babf5f2-6a9c-45b5-9144-ca5eae6a7a6d",
]

# crawl batch size, beware of open file limits
# check batch size, beware of open file limits
# ⚠️ do not exceed MAX_POOL_SIZE
BATCH_SIZE = 40
# crawl url if last check is older than
SINCE = "1w"

# check resource if last check is older than
CHECK_DELAYS = ["12 hours", "1 day", "7 days", "30 days"] # must be a list of strings, postgres interval syntax
CHECK_DELAY_DEFAULT = "1w"
bolinocroustibat marked this conversation as resolved.
Show resolved Hide resolved

# seconds to wait for between batches
SLEEP_BETWEEN_BATCHES = 60

# max download filesize in bytes (100 MB)
MAX_FILESIZE_ALLOWED.csv = 104857600
MAX_FILESIZE_ALLOWED.csvgz = 104857600
Expand Down
69 changes: 33 additions & 36 deletions udata_hydra/crawl/select_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,10 @@
from humanfriendly import parse_timespan

from udata_hydra import config, context
from udata_hydra.db.check import Check
from udata_hydra.db.resource import Resource


async def select_rows_based_on_query(connection, q: str, *args) -> list[Record]:
"""
A transaction wrapper around a select query q pass as param with *args.
It first creates a temporary table based on this query,
then fetches the selected rows and drops the temporary table.
It finally returns the selected rows.
"""
temporary_table = "check_urls"
create_temp_select_table_query = (
f"""CREATE TEMPORARY TABLE {temporary_table} AS {q} FOR UPDATE;"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The temporary table was used to make the ["select" and "mark as crawling"] step atomic, to allow for multiple crawlers. See #68.
Somehow, with the status refactoring, we had already dropped the await connection.execute(update_select_catalog_query) call.

Thus I don't think this temporary table is useful today, but we're probably not multi-crawler safe anymore either :p
Since we don't use multiple crawlers yet, this can be checked in a separate PR (it isn't introduced here).

)
async with connection.transaction():
await connection.execute("BEGIN;")
await connection.execute(create_temp_select_table_query, *args)
to_check = await connection.fetch(f"SELECT * FROM {temporary_table};")
await connection.execute("COMMIT;")
await connection.execute(f"DROP TABLE {temporary_table};")
return to_check


async def select_batch_resources_to_check() -> list[Record]:
"""Select a batch of resources to check from the catalog
- It first selects resources with priority=True
Expand All @@ -49,7 +30,7 @@ async def select_batch_resources_to_check() -> list[Record]:
) s
ORDER BY random() LIMIT {config.BATCH_SIZE}
"""
to_check: list[Record] = await select_rows_based_on_query(connection, q)
to_check: list[Record] = await connection.fetch(q)

# then urls without checks
if len(to_check) < config.BATCH_SIZE:
Expand All @@ -63,25 +44,41 @@ async def select_batch_resources_to_check() -> list[Record]:
) s
ORDER BY random() LIMIT {config.BATCH_SIZE}
"""
to_check += await select_rows_based_on_query(connection, q)
to_check += await connection.fetch(q)

# if not enough for our batch size, handle outdated checks
if len(to_check) < config.BATCH_SIZE:
since = parse_timespan(config.SINCE) # in seconds
since = datetime.now(timezone.utc) - timedelta(seconds=since)
limit = config.BATCH_SIZE - len(to_check)
q = f"""
SELECT * FROM (
SELECT catalog.url, dataset_id, catalog.resource_id
FROM catalog, checks
WHERE catalog.last_check IS NOT NULL
AND {excluded}
AND catalog.last_check = checks.id
AND checks.created_at <= $1
AND catalog.priority = False
) s
ORDER BY random() LIMIT {limit}

# Base query parts
query_start = f"""
SELECT * FROM (
SELECT catalog.url, dataset_id, catalog.resource_id
FROM catalog
JOIN checks ON catalog.last_check = checks.id
WHERE (
(checks.detected_last_modified_at IS NULL AND checks.created_at < CURRENT_DATE - INTERVAL '{config.CHECK_DELAY_DEFAULT}')
OR
(checks.detected_last_modified_at IS NOT NULL AND (
"""

# Construct the dynamic part of the query
dynamic_conditions = " OR ".join(
f"(checks.created_at >= checks.detected_last_modified_at + INTERVAL '{delay}' AND checks.created_at < CURRENT_DATE - INTERVAL '{delay}')"
for delay in config.CHECK_DELAYS
)

query_end = f"""
))
)
AND catalog.priority = False
) s
ORDER BY random() LIMIT {limit};
"""
to_check += await select_rows_based_on_query(connection, q, since)

# Combine all parts to form the final query
final_query = f"{query_start} {dynamic_conditions} {query_end}"

to_check += await connection.fetch(final_query)

return to_check