diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml
index 13d9643090..169a6fb7bc 100644
--- a/.github/ISSUE_TEMPLATE/bug.yml
+++ b/.github/ISSUE_TEMPLATE/bug.yml
@@ -1,5 +1,5 @@
name: Bug Report
-description: If something isn't working as expected and you're sure your issue is reproducible, please file this type of issue!
+description: Report a demonstrable problem caused by code in this repo.
title: "[Bug]: "
labels: ["bug"]
body:
@@ -7,8 +7,8 @@ body:
- type: input
id: version
attributes:
- label: Browsertrix Cloud Version
- description: This can be found in the site footer
+ label: Browsertrix Version
+        description: This can be found at the bottom of the Browsertrix web app.
placeholder: "v1.5.0-beta.0-67d0c6a"
validations:
required: true
@@ -17,27 +17,38 @@ body:
attributes:
label: What did you expect to happen? What happened instead?
description: |
- "I was trying to modify the Page Load Timeout value in a saved workflow, however..."
-
- Please submit any screenshots/videos that can be used to understand how to reproduce the issue. You can attach images by clicking this area to highlight it and then dragging files into the browser window.
+ A clear and concise description of the bug, and what you expected to happen instead.
- If your problem is related to crawling, or something wasn't captured in the way you expect please include a link to the finished crawl/workflow if possible.
+ For issues related to crawling or replay, please include a link to the archived item and workflow when possible.
validations:
required: true
# Step-by-step reproduction instructions
- type: textarea
attributes:
- label: Step-by-step reproduction instructions
+ label: Reproduction instructions
+ description: Step-by-step description of how to reproduce the issue, including the page URL if applicable.
placeholder: |
1. Navigate to...
2. Click on...
3. See error...
validations:
required: true
+ # Screenshots / videos
+ - type: textarea
+ attributes:
+ label: Screenshots / Video
+ description: Please attach any screenshots or screen recordings that demonstrate the bug. You can attach images by clicking this area to highlight it and then dragging files into the browser window.
+ # Environment
+ - type: input
+ attributes:
+ label: Environment
+ description: Please specify your browser if the issue is related to the web app, and provide information on your operating system if you're running Browsertrix locally.
+ placeholder: |
+ Browser:
+ Browser version:
+ OS:
# Additional details
- type: textarea
attributes:
label: Additional details
- description: Add any other relevant information here, such as your local environment if you are running Browsertrix Cloud locally.
- validations:
- required: false
+      description: Any additional context that helps us investigate the issue. For example, does the issue only happen in a specific browser? Are there related forum discussions?
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index 4eda5b88ba..f96130dd16 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -3,14 +3,14 @@ blank_issues_enabled: true
contact_links:
- name: Report a replay issue
- about: Issues related to archived content not displaying properly should be reported in the ReplayWeb.page repo.
+ about: Issues related to an archived item or collection not replaying properly should be reported in the ReplayWeb.page repo.
url: https://github.com/webrecorder/replayweb.page/issues/new?&labels=replay+bug%2Cbug&projects=&template=replay-bug.yml&title=[Replay+Bug]%3A+
- name: Report a security vulnerability
- about: Please do not file an issue and instead email security@webrecorder.org. We will follow up with you there!
+ about: Please email security@webrecorder.org directly. We will follow up with you there!
url: https://webrecorder.net/.well-known/security.txt
- name: Get help on our forum
url: https://forum.webrecorder.net/
about: Have a ("how do I...?") question? Not sure if your issue is reproducible? The best way to get help is on our community forum!
- name: Check out the docs
url: https://docs.browsertrix.cloud
- about: Solutions to common questions may be available in the documentation!
+ about: Find solutions to common questions, such as how to install, develop, and deploy Browsertrix.
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/feature-change.yml b/.github/ISSUE_TEMPLATE/feature-change.yml
index 2f171ebab3..8ce9df44fa 100644
--- a/.github/ISSUE_TEMPLATE/feature-change.yml
+++ b/.github/ISSUE_TEMPLATE/feature-change.yml
@@ -1,49 +1,20 @@
name: Feature / Change Request
-description: If new things should be added or something that is working as intended should be changed, please file this type of issue!
+description: Request a new feature or change to an existing feature of the app.
title: "[Feature]: "
labels: ["enhancement"]
body:
- # Context
- - type: textarea
- attributes:
- label: Context
- description: Describe any prior information that we are taking into account to inform this future development.
- placeholder: "Now that x is done we should do y to accomplish z."
- validations:
- required: true
# User story sentence
- type: textarea
attributes:
label: What change would you like to see?
- description: Describe the solution you'd like. If relevant, include ways in which you've tried to solve the issue with the current version.
- placeholder: "As a user, I want to be able to ____ so that I can ____"
+ description: Describe the improvement or feature you'd like added to Browsertrix.
+ placeholder: I would like to be able to ____________ so that I can ____________.
validations:
required: true
- # Requirements
- - type: textarea
- attributes:
- label: Requirements
- description: |
- Intended primarily for use by Webrecorder team, leave blank if unknown.
-
- List the outcomes of the feature being implemented without design or implementation details.
- placeholder: |
- 1. Item metadata should show links to the collections that the item belongs to.
- 2. Items can be added or removed from collections when editing an item.
- validations:
- required: false
- # Todo
+ # Context
- type: textarea
attributes:
- label: Todo
- description: |
- Intended primarily for use by Webrecorder team, leave blank if unknown.
-
- Any other linked issues / tasks to complete to implement this feature.
- placeholder: |
- - [ ] Mockups:
- - [ ] Design:
- - [ ] UI:
- - [ ] API:
+ label: Context
+ description: Any background information that helps us understand the request.
validations:
- required: false
+ required: true
\ No newline at end of file
diff --git a/.github/workflows/deploy-dev.yaml b/.github/workflows/deploy-dev.yaml
index 6d21a6604d..5f6261b84a 100644
--- a/.github/workflows/deploy-dev.yaml
+++ b/.github/workflows/deploy-dev.yaml
@@ -15,20 +15,45 @@ jobs:
with:
driver-opts: network=host
- - name: Login to Regsitry
+ - name: Login to Registry
uses: docker/login-action@v2
with:
registry: ${{ secrets.DO_REGISTRY }}
username: ${{ secrets.DO_API_TOKEN }}
password: ${{ secrets.DO_API_TOKEN }}
- -
- name: Set Env Vars
+ - name: Set Env Vars
run: |
echo VERSION=`cat version.txt` >> $GITHUB_ENV
echo GIT_COMMIT_HASH=`git rev-parse --short HEAD` >> $GITHUB_ENV
echo GIT_BRANCH_NAME=`git rev-parse --abbrev-ref HEAD` >> $GITHUB_ENV
+ - name: Checkout values file from ops repo
+ uses: actions/checkout@v4
+ with:
+ repository: "webrecorder/browsertrix-cloud-ops"
+ path: "browsertrix-cloud-ops"
+ ssh-key: ${{ secrets.DEPLOY_KEY_OPS_REPO }}
+ sparse-checkout: |
+ scripts/decrypt-values.py
+ values/btrix-dev-values.yml
+ poetry.lock
+ pyproject.toml
+ sparse-checkout-cone-mode: false
+
+ - name: Install poetry
+ run: pipx install poetry
+
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.11"
+ cache: "poetry"
+
+ - name: Install vault decryption dependencies
+ working-directory: browsertrix-cloud-ops
+ run: |
+ poetry install
+
- name: Build Backend
uses: docker/build-push-action@v3
with:
@@ -57,12 +82,10 @@ jobs:
- name: Get Kubeconfig
env:
KUBECONFIG_DATA: ${{ secrets.KUBECONFIG_DATA }}
- DEV_VALUES: ${{ secrets.DEV_VALUES }}
run: |
printf "$KUBECONFIG_DATA" >> ./.kubeconfig
chmod 400 ./.kubeconfig
- printf "$DEV_VALUES" >> ./dev-values.yaml
- name: Install Kubectl
uses: azure/setup-kubectl@v3
@@ -72,6 +95,13 @@ jobs:
with:
version: 3.10.2
+ - name: Decrypt values file
+ env:
+ ANSIBLE_VAULT_PASSWORD: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
+ working-directory: browsertrix-cloud-ops
+ run: |
+ poetry run python scripts/decrypt-values.py values/btrix-dev-values.yml ../dev-values.yaml
+
- name: Start Cluster with Helm
run: |
KUBECONFIG=./.kubeconfig helm upgrade --install -f ./chart/values.yaml -f ./dev-values.yaml btrix ./chart/
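The decrypt step above calls scripts/decrypt-values.py from the private webrecorder/browsertrix-cloud-ops repo, which is not part of this diff. As a rough, hypothetical sketch of what such an Ansible Vault decryption script could look like using Ansible's Python API (the argument handling and function name here are assumptions, not the actual script):

    # hypothetical sketch only -- the real script lives in the ops repo and is not shown here
    import os
    import sys

    from ansible.constants import DEFAULT_VAULT_ID_MATCH
    from ansible.parsing.vault import VaultLib, VaultSecret


    def decrypt_file(encrypted_path: str, output_path: str) -> None:
        # the vault password is supplied via the ANSIBLE_VAULT_PASSWORD env var, as in the workflow step
        password = os.environ["ANSIBLE_VAULT_PASSWORD"].encode()
        vault = VaultLib([(DEFAULT_VAULT_ID_MATCH, VaultSecret(password))])

        with open(encrypted_path, "rb") as fin:
            plaintext = vault.decrypt(fin.read())

        with open(output_path, "wb") as fout:
            fout.write(plaintext)


    if __name__ == "__main__":
        decrypt_file(sys.argv[1], sys.argv[2])

The decrypted file is then passed to helm via -f ./dev-values.yaml, replacing the previous approach of storing the rendered values file directly in the DEV_VALUES secret.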
diff --git a/.github/workflows/frontend-build-check.yaml b/.github/workflows/frontend-build-check.yaml
index da93d7ff1a..0d4bf7d684 100644
--- a/.github/workflows/frontend-build-check.yaml
+++ b/.github/workflows/frontend-build-check.yaml
@@ -15,7 +15,7 @@ jobs:
- name: Setup Node
uses: actions/setup-node@v3
with:
- node-version: '16'
+ node-version: '18'
cache: 'yarn'
cache-dependency-path: frontend/yarn.lock
- name: Restore cache
@@ -30,6 +30,14 @@ jobs:
env:
HUSKY: 0
run: yarn install --frozen-lockfile
+ - name: Lint
+ working-directory: frontend
+ run: yarn lint:check
+ - name: Format
+ working-directory: frontend
+ # TODO Reenable when https://github.com/webrecorder/browsertrix-cloud/issues/1618 is addressed
+ # run: yarn format:check
+ run: yarn prettier --list-different .
- name: Unit tests
working-directory: frontend
run: yarn test
diff --git a/.github/workflows/publish-helm-chart.yaml b/.github/workflows/publish-helm-chart.yaml
index e7e810819e..4f9e3ee814 100644
--- a/.github/workflows/publish-helm-chart.yaml
+++ b/.github/workflows/publish-helm-chart.yaml
@@ -4,6 +4,7 @@ on:
push:
branches:
- main
+ - "*-release"
jobs:
package_chart:
diff --git a/.github/workflows/ui-tests-playwright.yml b/.github/workflows/ui-tests-playwright.yml
index 4fd201b8b2..4ae8484b8b 100644
--- a/.github/workflows/ui-tests-playwright.yml
+++ b/.github/workflows/ui-tests-playwright.yml
@@ -22,7 +22,7 @@ jobs:
- name: Setup Node
uses: actions/setup-node@v3
with:
- node-version: '16'
+ node-version: '18'
cache: 'yarn'
cache-dependency-path: frontend/yarn.lock
- name: Install dependencies
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 72a5ae839b..1d83ba443b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/psf/black
- rev: 22.12.0
+ rev: 24.1.1
hooks:
- id: black
args: ["backend/btrixcloud/"]
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
index ede834d5bd..58e22b1982 100644
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@@ -2,6 +2,7 @@
"recommendations": [
"dbaeumer.vscode-eslint",
"esbenp.prettier-vscode",
+ "dbaeumer.vscode-eslint",
"runem.lit-plugin",
"bradlc.vscode-tailwindcss",
"redhat.vscode-yaml",
diff --git a/.vscode/settings.json b/.vscode/settings.json
index e2ee7ebcc5..ae2e75fbc0 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -20,12 +20,16 @@
"Browsertrix",
"btrix",
"Elems",
+ "favicons",
"hoverable",
+ "micromark",
"novnc",
"profileid",
"tailwindcss",
"wacz",
"Webrecorder",
+ "wysimark",
+ "xstate",
"zxcvbn"
],
"cSpell.languageSettings": [
@@ -41,5 +45,8 @@
"css"
]
}
- ]
+ ],
+ "eslint.workingDirectories": ["./frontend"],
+ "eslint.nodePath": "./frontend/node_modules",
+ "tailwindCSS.experimental.classRegex": ["tw`([^`]*)"]
}
diff --git a/README.md b/README.md
index fa086b3f68..4debb745d5 100644
--- a/README.md
+++ b/README.md
@@ -1,45 +1,42 @@
-# Browsertrix Cloud
+
+
+
-
+
-Browsertrix Cloud is an open-source cloud-native high-fidelity browser-based crawling service designed
+Browsertrix is an open-source cloud-native high-fidelity browser-based crawling service designed
to make web archiving easier and more accessible for everyone.
-The service provides an API and UI for scheduling crawls and viewing results,
-and managing all aspects of crawling process. This system provides the orchestration and management around crawling,
-while the actual crawling is performed using
-[Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) containers, which are launched for each crawl.
+The service provides an API and UI for scheduling crawls, viewing results, and managing all aspects of the crawling process. This system provides the orchestration and management around crawling, while the actual crawling is performed using [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) containers, which are launched for each crawl.
-See [Browsertrix Cloud](https://browsertrix.com) for a feature overview and information about Browsertrix Cloud Hosting.
+See [browsertrix.com](https://browsertrix.com) for a feature overview and information about Browsertrix hosting.
## Documentation
-The full docs for using, deploying and developing Browsertrix Cloud are available at: [https://docs.browsertrix.cloud](https://docs.browsertrix.cloud)
+The full docs for using, deploying, and developing Browsertrix are available at: [https://docs.browsertrix.cloud](https://docs.browsertrix.cloud)
-## Deployment
+## Deployment
The latest deployment documentation is available at: [https://docs.browsertrix.cloud/deploy](https://docs.browsertrix.cloud/deploy)
-The docs cover deploying Browsertrix Cloud in different environments using Kubernetes, from a single-node setup to scalable clusters in the cloud.
+The docs cover deploying Browsertrix in different environments using Kubernetes, from a single-node setup to scalable clusters in the cloud.
-Previously, Browsertrix Cloud also supported Docker Compose and podman-based deployment. This is now deprecated due to the complexity
-of maintaining feature parity across different setups, and with various Kubernetes deployment options being available and easy to deploy, even on a single machine.
+Previously, Browsertrix also supported Docker Compose and podman-based deployment. This has been deprecated due to the complexity of maintaining feature parity across different setups, and because various Kubernetes deployment options are now available and easy to deploy, even on a single machine.
-Making deployment of Browsertrix Cloud as easy as possible remains a key goal, and we welcome suggestions for how we can further improve our Kubernetes deployment options.
+Making deployment of Browsertrix as easy as possible remains a key goal, and we welcome suggestions for how we can further improve our Kubernetes deployment options.
If you are looking to just try running a single crawl, you may want to try [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) first to test out the crawling capabilities.
## Development Status
-Browsertrix Cloud is currently in a beta, though the system and backend API is fairly stable, we are working on many additional features.
+Browsertrix is currently in beta. Though the system and backend API are fairly stable, we are working on many additional features.
Additional developer documentation is available at [https://docs.browsertrix.cloud/develop](https://docs.browsertrix.cloud/develop/)
Please see the GitHub issues and [this GitHub Project](https://github.com/orgs/webrecorder/projects/9) for our current project plan and tasks.
-
## License
-Browsertrix Cloud is made available under the AGPLv3 License.
+Browsertrix is made available under the AGPLv3 License.
Documentation is made available under the Creative Commons Attribution 4.0 International License
diff --git a/ansible/Pipfile.lock b/ansible/Pipfile.lock
index e811d1d2e8..486f3aae20 100644
--- a/ansible/Pipfile.lock
+++ b/ansible/Pipfile.lock
@@ -140,72 +140,61 @@
},
"cffi": {
"hashes": [
- "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5",
- "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef",
- "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104",
- "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426",
- "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405",
- "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375",
- "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a",
- "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e",
- "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc",
- "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf",
- "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185",
- "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497",
- "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3",
- "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35",
- "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c",
- "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83",
- "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21",
- "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca",
- "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984",
- "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac",
- "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd",
- "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee",
- "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a",
- "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2",
- "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192",
- "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7",
- "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585",
- "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f",
- "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e",
- "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27",
- "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b",
- "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e",
- "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e",
- "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d",
- "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c",
- "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415",
- "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82",
- "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02",
- "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314",
- "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325",
- "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c",
- "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3",
- "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914",
- "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045",
- "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d",
- "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9",
- "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5",
- "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2",
- "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c",
- "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3",
- "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2",
- "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8",
- "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d",
- "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d",
- "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9",
- "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162",
- "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76",
- "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4",
- "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e",
- "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9",
- "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6",
- "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b",
- "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01",
- "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"
- ],
- "version": "==1.15.1"
+ "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc",
+ "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a",
+ "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417",
+ "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab",
+ "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520",
+ "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36",
+ "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743",
+ "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8",
+ "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed",
+ "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684",
+ "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56",
+ "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324",
+ "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d",
+ "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235",
+ "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e",
+ "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088",
+ "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000",
+ "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7",
+ "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e",
+ "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673",
+ "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c",
+ "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe",
+ "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2",
+ "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098",
+ "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8",
+ "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a",
+ "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0",
+ "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b",
+ "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896",
+ "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e",
+ "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9",
+ "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2",
+ "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b",
+ "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6",
+ "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404",
+ "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f",
+ "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0",
+ "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4",
+ "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc",
+ "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936",
+ "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba",
+ "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872",
+ "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb",
+ "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614",
+ "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1",
+ "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d",
+ "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969",
+ "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b",
+ "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4",
+ "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627",
+ "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956",
+ "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"
+ ],
+ "markers": "platform_python_implementation != 'PyPy'",
+ "version": "==1.16.0"
},
"chardet": {
"hashes": [
@@ -329,28 +318,42 @@
},
"cryptography": {
"hashes": [
- "sha256:059e348f9a3c1950937e1b5d7ba1f8e968508ab181e75fc32b879452f08356db",
- "sha256:1a5472d40c8f8e91ff7a3d8ac6dfa363d8e3138b961529c996f3e2df0c7a411a",
- "sha256:1a8e6c2de6fbbcc5e14fd27fb24414507cb3333198ea9ab1258d916f00bc3039",
- "sha256:1fee5aacc7367487b4e22484d3c7e547992ed726d14864ee33c0176ae43b0d7c",
- "sha256:5d092fdfedaec4cbbffbf98cddc915ba145313a6fdaab83c6e67f4e6c218e6f3",
- "sha256:5f0ff6e18d13a3de56f609dd1fd11470918f770c6bd5d00d632076c727d35485",
- "sha256:7bfc55a5eae8b86a287747053140ba221afc65eb06207bedf6e019b8934b477c",
- "sha256:7fa01527046ca5facdf973eef2535a27fec4cb651e4daec4d043ef63f6ecd4ca",
- "sha256:8dde71c4169ec5ccc1087bb7521d54251c016f126f922ab2dfe6649170a3b8c5",
- "sha256:8f4ab7021127a9b4323537300a2acfb450124b2def3756f64dc3a3d2160ee4b5",
- "sha256:948224d76c4b6457349d47c0c98657557f429b4e93057cf5a2f71d603e2fc3a3",
- "sha256:9a6c7a3c87d595608a39980ebaa04d5a37f94024c9f24eb7d10262b92f739ddb",
- "sha256:b46e37db3cc267b4dea1f56da7346c9727e1209aa98487179ee8ebed09d21e43",
- "sha256:b4ceb5324b998ce2003bc17d519080b4ec8d5b7b70794cbd2836101406a9be31",
- "sha256:cb33ccf15e89f7ed89b235cff9d49e2e62c6c981a6061c9c8bb47ed7951190bc",
- "sha256:d198820aba55660b4d74f7b5fd1f17db3aa5eb3e6893b0a41b75e84e4f9e0e4b",
- "sha256:d34579085401d3f49762d2f7d6634d6b6c2ae1242202e860f4d26b046e3a1006",
- "sha256:eb8163f5e549a22888c18b0d53d6bb62a20510060a22fd5a995ec8a05268df8a",
- "sha256:f73bff05db2a3e5974a6fd248af2566134d8981fd7ab012e5dd4ddb1d9a70699"
+ "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b",
+ "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce",
+ "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88",
+ "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7",
+ "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20",
+ "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9",
+ "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff",
+ "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1",
+ "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764",
+ "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b",
+ "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298",
+ "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1",
+ "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824",
+ "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257",
+ "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a",
+ "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129",
+ "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb",
+ "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929",
+ "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854",
+ "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52",
+ "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923",
+ "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885",
+ "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0",
+ "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd",
+ "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2",
+ "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18",
+ "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b",
+ "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992",
+ "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74",
+ "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660",
+ "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925",
+ "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"
],
+ "index": "pypi",
"markers": "python_version >= '3.7'",
- "version": "==41.0.1"
+ "version": "==42.0.4"
},
"distro": {
"hashes": [
diff --git a/assets/browsertrix-lockup-color-dynamic.svg b/assets/browsertrix-lockup-color-dynamic.svg
new file mode 100644
index 0000000000..81b7e90425
--- /dev/null
+++ b/assets/browsertrix-lockup-color-dynamic.svg
@@ -0,0 +1,17 @@
+
+
+
diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py
index c1ebc9c0ea..b8cd420ea5 100644
--- a/backend/btrixcloud/background_jobs.py
+++ b/backend/btrixcloud/background_jobs.py
@@ -1,4 +1,5 @@
"""k8s background jobs"""
+
import asyncio
from datetime import datetime
from typing import Optional, Tuple, Union, List, Dict, TYPE_CHECKING, cast
@@ -402,11 +403,11 @@ async def get_replica_job_file(
profile = await self.profile_ops.get_profile(UUID(job.object_id), org)
return BaseFile(**profile.resource.dict())
- item_res = await self.base_crawl_ops.get_crawl_raw(job.object_id, org)
- matching_file = [
- f for f in item_res.get("files", []) if f["filename"] == job.file_path
- ][0]
- return BaseFile(**matching_file)
+ item_res = await self.base_crawl_ops.get_base_crawl(job.object_id, org)
+ matching_file = [f for f in item_res.files if f.filename == job.file_path][
+ 0
+ ]
+ return matching_file
# pylint: disable=broad-exception-caught, raise-missing-from
except Exception:
raise HTTPException(status_code=404, detail="file_not_found")
diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index af4f1c6720..67e6d96d41 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -2,10 +2,9 @@
import os
from datetime import timedelta
-from typing import Optional, List, Union, Type, TYPE_CHECKING
+from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast
from uuid import UUID
import urllib.parse
-import contextlib
import asyncio
from fastapi import HTTPException, Depends
@@ -30,17 +29,17 @@
if TYPE_CHECKING:
from .crawlconfigs import CrawlConfigOps
- from .crawlmanager import CrawlManager
from .users import UserManager
from .orgs import OrgOps
from .colls import CollectionOps
from .storages import StorageOps
from .webhooks import EventWebhookOps
from .background_jobs import BackgroundJobOps
+ from .pages import PageOps
else:
- CrawlConfigOps = UserManager = OrgOps = CollectionOps = object
- CrawlManager = StorageOps = EventWebhookOps = BackgroundJobOps = object
+ CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object
+ StorageOps = EventWebhookOps = BackgroundJobOps = object
# Presign duration must be less than 604800 seconds (one week),
# so set this one minute short of a week.
@@ -56,20 +55,21 @@ class BaseCrawlOps:
# pylint: disable=duplicate-code, too-many-arguments, too-many-locals
crawl_configs: CrawlConfigOps
- crawl_manager: CrawlManager
user_manager: UserManager
orgs: OrgOps
colls: CollectionOps
storage_ops: StorageOps
event_webhook_ops: EventWebhookOps
background_job_ops: BackgroundJobOps
+ page_ops: PageOps
+
+ presign_duration: int
def __init__(
self,
mdb,
users: UserManager,
orgs: OrgOps,
- crawl_manager: CrawlManager,
crawl_configs: CrawlConfigOps,
colls: CollectionOps,
storage_ops: StorageOps,
@@ -77,7 +77,6 @@ def __init__(
background_job_ops: BackgroundJobOps,
):
self.crawls = mdb["crawls"]
- self.crawl_manager = crawl_manager
self.crawl_configs = crawl_configs
self.user_manager = users
self.orgs = orgs
@@ -85,6 +84,7 @@ def __init__(
self.storage_ops = storage_ops
self.event_webhook_ops = event_webhook_ops
self.background_job_ops = background_job_ops
+ self.page_ops = cast(PageOps, None)
presign_duration_minutes = int(
os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT
@@ -94,13 +94,17 @@ def __init__(
min(presign_duration_minutes, PRESIGN_MINUTES_MAX) * 60
)
+ def set_page_ops(self, page_ops):
+ """set page ops reference"""
+ self.page_ops = page_ops
+
async def get_crawl_raw(
self,
crawlid: str,
org: Optional[Organization] = None,
type_: Optional[str] = None,
project: Optional[dict[str, bool]] = None,
- ):
+ ) -> Dict[str, Any]:
"""Get data for single crawl"""
query: dict[str, object] = {"_id": crawlid}
@@ -117,40 +121,61 @@ async def get_crawl_raw(
return res
- async def _files_to_resources(self, files, org, crawlid):
+ async def _files_to_resources(
+ self,
+ files: List[Dict],
+ org: Organization,
+ crawlid: str,
+ qa_run_id: Optional[str] = None,
+ ) -> List[CrawlFileOut]:
if not files:
return []
crawl_files = [CrawlFile(**data) for data in files]
- return await self.resolve_signed_urls(crawl_files, org, crawl_id=crawlid)
+ return await self._resolve_signed_urls(crawl_files, org, crawlid, qa_run_id)
+
+ async def get_wacz_files(self, crawl_id: str, org: Organization):
+ """Return list of WACZ files associated with crawl."""
+ wacz_files = []
+ crawl = await self.get_base_crawl(crawl_id, org)
+ for file_ in crawl.files:
+ if file_.filename.endswith(".wacz"):
+ wacz_files.append(file_)
+ return wacz_files
- async def get_crawl(
+ async def get_base_crawl(
self,
crawlid: str,
org: Optional[Organization] = None,
type_: Optional[str] = None,
- cls_type: Type[Union[CrawlOut, CrawlOutWithResources]] = CrawlOutWithResources,
- ):
- """Get data for single base crawl"""
- res = await self.get_crawl_raw(crawlid, org, type_)
-
- if cls_type == CrawlOutWithResources:
- res["resources"] = await self._files_to_resources(
- res.get("files"), org, crawlid
- )
+ project: Optional[dict[str, bool]] = None,
+ ) -> BaseCrawl:
+ """Get crawl data for internal use"""
+ res = await self.get_crawl_raw(crawlid, org, type_, project)
+ return BaseCrawl.from_dict(res)
- if res.get("collectionIds"):
- res["collections"] = await self.colls.get_collection_names(
- res.get("collectionIds")
- )
+ async def get_crawl_out(
+ self,
+ crawlid: str,
+ org: Optional[Organization] = None,
+ type_: Optional[str] = None,
+ skip_resources=False,
+ ) -> CrawlOutWithResources:
+ """Get crawl data for api output"""
+ res = await self.get_crawl_raw(crawlid, org, type_)
- res.pop("files", None)
+ files = res.pop("files", None)
res.pop("errors", None)
- crawl = cls_type.from_dict(res)
+ if not skip_resources:
+ coll_ids = res.get("collectionIds")
+ if coll_ids:
+ res["collections"] = await self.colls.get_collection_names(coll_ids)
+
+ crawl = CrawlOutWithResources.from_dict(res)
- if crawl.type == "crawl":
- crawl = await self._resolve_crawl_refs(crawl, org)
+ if not skip_resources:
+ crawl = await self._resolve_crawl_refs(crawl, org, files)
if crawl.config and crawl.config.seeds:
crawl.config.seeds = None
@@ -161,23 +186,22 @@ async def get_crawl(
return crawl
- async def get_resource_resolved_raw_crawl(
- self, crawlid: str, org: Organization, type_=None
- ):
- """return single base crawl with resources resolved"""
- res = await self.get_crawl_raw(crawlid=crawlid, type_=type_, org=org)
- res["resources"] = await self._files_to_resources(
- res.get("files"), org, res["_id"]
- )
- return res
+ async def get_internal_crawl_out(self, crawl_id):
+ """add internal prefix for relative paths"""
+ crawl_out = await self.get_crawl_out(crawl_id)
+ resources = crawl_out.resources or []
+ for file_ in resources:
+ file_.path = self.storage_ops.resolve_internal_access_path(file_.path)
+
+ return crawl_out
async def _update_crawl_collections(
self, crawl_id: str, org: Organization, collection_ids: List[UUID]
):
"""Update crawl collections to match updated list."""
- crawl = await self.get_crawl(crawl_id, org, cls_type=CrawlOut)
+ crawl = await self.get_crawl_out(crawl_id, org, skip_resources=True)
- prior_coll_ids = set(crawl.collectionIds)
+ prior_coll_ids = set(crawl.collectionIds or [])
updated_coll_ids = set(collection_ids)
# Add new collections
@@ -257,50 +281,7 @@ async def add_crawl_file_replica(
)
async def shutdown_crawl(self, crawl_id: str, org: Organization, graceful: bool):
- """stop or cancel specified crawl"""
- crawl = await self.get_crawl_raw(crawl_id, org)
- if crawl.get("type") != "crawl":
- return
-
- result = None
- try:
- result = await self.crawl_manager.shutdown_crawl(
- crawl_id, graceful=graceful
- )
-
- if result.get("success"):
- if graceful:
- await self.crawls.find_one_and_update(
- {"_id": crawl_id, "type": "crawl", "oid": org.id},
- {"$set": {"stopping": True}},
- )
- return result
-
- except Exception as exc:
- # pylint: disable=raise-missing-from
- # if reached here, probably crawl doesn't exist anymore
- raise HTTPException(
- status_code=404, detail=f"crawl_not_found, (details: {exc})"
- )
-
- # if job no longer running, canceling is considered success,
- # but graceful stoppage is not possible, so would be a failure
- if result.get("error") == "Not Found":
- if not graceful:
- await self.update_crawl_state(crawl_id, "canceled")
- crawl = await self.get_crawl_raw(crawl_id, org)
- if not await self.crawl_configs.stats_recompute_last(
- crawl["cid"], 0, -1
- ):
- raise HTTPException(
- status_code=404,
- detail=f"crawl_config_not_found: {crawl['cid']}",
- )
-
- return {"success": True}
-
- # return whatever detail may be included in the response
- raise HTTPException(status_code=400, detail=result)
+ """placeholder, implemented in crawls, base version does nothing"""
async def delete_crawls(
self,
@@ -308,24 +289,24 @@ async def delete_crawls(
delete_list: DeleteCrawlList,
type_: str,
user: Optional[User] = None,
- ):
+ ) -> tuple[int, dict[UUID, dict[str, int]], bool]:
"""Delete a list of crawls by id for given org"""
- cids_to_update: dict[str, dict[str, int]] = {}
+ cids_to_update: dict[UUID, dict[str, int]] = {}
size = 0
for crawl_id in delete_list.crawl_ids:
- crawl = await self.get_crawl_raw(crawl_id, org)
- if crawl.get("type") != type_:
+ crawl = await self.get_base_crawl(crawl_id, org)
+ if crawl.type != type_:
continue
# Ensure user has appropriate permissions for all crawls in list:
# - Crawler users can delete their own crawls
# - Org owners can delete any crawls in org
- if user and (crawl.get("userid") != user.id) and not org.is_owner(user):
+ if user and (crawl.userid != user.id) and not org.is_owner(user):
raise HTTPException(status_code=403, detail="not_allowed")
- if type_ == "crawl" and not crawl.get("finished"):
+ if type_ == "crawl" and not crawl.finished:
try:
await self.shutdown_crawl(crawl_id, org, graceful=False)
except Exception as exc:
@@ -334,10 +315,13 @@ async def delete_crawls(
status_code=400, detail=f"Error Stopping Crawl: {exc}"
)
+ if type_ == "crawl":
+ await self.page_ops.delete_crawl_pages(crawl_id, org.id)
+
crawl_size = await self._delete_crawl_files(crawl, org)
size += crawl_size
- cid = crawl.get("cid")
+ cid = crawl.cid
if cid:
if cids_to_update.get(cid):
cids_to_update[cid]["inc"] += 1
@@ -367,9 +351,8 @@ async def delete_crawls(
return res.deleted_count, cids_to_update, quota_reached
- async def _delete_crawl_files(self, crawl, org: Organization):
+ async def _delete_crawl_files(self, crawl: BaseCrawl, org: Organization):
"""Delete files associated with crawl from storage."""
- crawl = BaseCrawl.from_dict(crawl)
size = 0
for file_ in crawl.files:
size += file_.size
@@ -381,12 +364,18 @@ async def _delete_crawl_files(self, crawl, org: Organization):
return size
+ async def delete_crawl_files(self, crawl_id: str, oid: UUID):
+ """Delete crawl files"""
+ crawl = await self.get_base_crawl(crawl_id)
+ org = await self.orgs.get_org_by_id(oid)
+ return await self._delete_crawl_files(crawl, org)
+
async def _resolve_crawl_refs(
self,
crawl: Union[CrawlOut, CrawlOutWithResources],
org: Optional[Organization],
+ files: Optional[list[dict]],
add_first_seed: bool = True,
- files: Optional[list[dict]] = None,
):
"""Resolve running crawl data"""
# pylint: disable=too-many-branches
@@ -395,6 +384,12 @@ async def _resolve_crawl_refs(
config = await self.crawl_configs.get_crawl_config(
crawl.cid, org.id if org else None, active_only=False
)
+
+ if not org:
+ org = await self.orgs.get_org_by_id(crawl.oid)
+ if not org:
+ raise HTTPException(status_code=400, detail="missing_org")
+
if config and config.config.seeds:
if add_first_seed:
first_seed = config.config.seeds[0]
@@ -415,17 +410,18 @@ async def _resolve_crawl_refs(
return crawl
- async def resolve_signed_urls(
+ async def _resolve_signed_urls(
self,
files: List[CrawlFile],
org: Organization,
- update_presigned_url: bool = False,
crawl_id: Optional[str] = None,
- ):
+ qa_run_id: Optional[str] = None,
+ update_presigned_url: bool = False,
+ ) -> List[CrawlFileOut]:
"""Regenerate presigned URLs for files as necessary"""
if not files:
print("no files")
- return
+ return []
delta = timedelta(seconds=self.presign_duration_seconds)
@@ -440,12 +436,17 @@ async def resolve_signed_urls(
presigned_url = await self.storage_ops.get_presigned_url(
org, file_, self.presign_duration_seconds
)
+
+ prefix = "files"
+ if qa_run_id:
+ prefix = f"qaFinished.{qa_run_id}.{prefix}"
+
await self.crawls.find_one_and_update(
- {"files.filename": file_.filename},
+ {f"{prefix}.filename": file_.filename},
{
"$set": {
- "files.$.presignedUrl": presigned_url,
- "files.$.expireAt": exp,
+ f"{prefix}.$.presignedUrl": presigned_url,
+ f"{prefix}.$.expireAt": exp,
}
},
)
@@ -470,25 +471,13 @@ async def resolve_signed_urls(
return out_files
- @contextlib.asynccontextmanager
- async def get_redis(self, crawl_id):
- """get redis url for crawl id"""
- redis_url = self.crawl_manager.get_redis_url(crawl_id)
-
- redis = await self.crawl_manager.get_redis_client(redis_url)
-
- try:
- yield redis
- finally:
- await redis.close()
-
async def add_to_collection(
self, crawl_ids: List[str], collection_id: UUID, org: Organization
):
"""Add crawls to collection."""
for crawl_id in crawl_ids:
- crawl_raw = await self.get_crawl_raw(crawl_id, org)
- crawl_collections = crawl_raw.get("collectionIds")
+ crawl = await self.get_base_crawl(crawl_id, org)
+ crawl_collections = crawl.collectionIds
if crawl_collections and crawl_id in crawl_collections:
raise HTTPException(
status_code=400, detail="crawl_already_in_collection"
@@ -638,11 +627,10 @@ async def delete_crawls_all_types(
uploads: list[str] = []
for crawl_id in delete_list.crawl_ids:
- crawl = await self.get_crawl_raw(crawl_id, org)
- type_ = crawl.get("type")
- if type_ == "crawl":
+ crawl = await self.get_base_crawl(crawl_id, org)
+ if crawl.type == "crawl":
crawls.append(crawl_id)
- if type_ == "upload":
+ if crawl.type == "upload":
uploads.append(crawl_id)
crawls_length = len(crawls)
@@ -793,7 +781,7 @@ async def get_all_crawls_search_values(
response_model=CrawlOutWithResources,
)
async def get_base_crawl(crawl_id: str, org: Organization = Depends(org_crawl_dep)):
- return await ops.get_crawl(crawl_id, org)
+ return await ops.get_crawl_out(crawl_id, org)
@app.get(
"/orgs/all/all-crawls/{crawl_id}/replay.json",
@@ -804,15 +792,15 @@ async def get_base_crawl_admin(crawl_id, user: User = Depends(user_dep)):
if not user.is_superuser:
raise HTTPException(status_code=403, detail="Not Allowed")
- return await ops.get_crawl(crawl_id, None)
+ return await ops.get_crawl_out(crawl_id, None)
@app.get(
"/orgs/{oid}/all-crawls/{crawl_id}/replay.json",
tags=["all-crawls"],
response_model=CrawlOutWithResources,
)
- async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)):
- return await ops.get_crawl(crawl_id, org)
+ async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
+ return await ops.get_crawl_out(crawl_id, org)
@app.patch("/orgs/{oid}/all-crawls/{crawl_id}", tags=["all-crawls"])
async def update_crawl(
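To illustrate the qa_run_id handling in _resolve_signed_urls above: for ordinary crawl files the positional update targets the top-level files array, while for a QA run it targets the nested qaFinished.<qa_run_id>.files array. A minimal sketch of the filter and update documents that code ends up building (the id, filename, and URL are made-up example values):

    from datetime import datetime, timedelta

    qa_run_id = "qa-20240301120000-abcdef123456"         # made-up example id
    filename = "crawl-part-0.wacz"                       # made-up example filename
    presigned_url = "https://storage.example.com/..."    # placeholder presigned URL
    exp = datetime.utcnow() + timedelta(seconds=604740)  # one minute short of a week

    # regular crawl file: prefix == "files"
    regular_filter = {"files.filename": filename}
    regular_update = {
        "$set": {"files.$.presignedUrl": presigned_url, "files.$.expireAt": exp}
    }

    # QA run file: prefix == f"qaFinished.{qa_run_id}.files"
    qa_prefix = f"qaFinished.{qa_run_id}.files"
    qa_filter = {f"{qa_prefix}.filename": filename}
    qa_update = {
        "$set": {
            f"{qa_prefix}.$.presignedUrl": presigned_url,
            f"{qa_prefix}.$.expireAt": exp,
        }
    }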
diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py
index aa2a05358e..6a23fe50d2 100644
--- a/backend/btrixcloud/colls.py
+++ b/backend/btrixcloud/colls.py
@@ -1,6 +1,7 @@
"""
Collections API
"""
+
from collections import Counter
from datetime import datetime
from uuid import UUID, uuid4
@@ -19,6 +20,7 @@
CollIdName,
UpdateColl,
AddRemoveCrawlList,
+ BaseCrawl,
CrawlOutWithResources,
Organization,
PaginatedResponse,
@@ -329,17 +331,18 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
total_size = 0
tags = []
- async for crawl in self.crawls.find({"collectionIds": collection_id}):
- if crawl["state"] not in SUCCESSFUL_STATES:
+ async for crawl_raw in self.crawls.find({"collectionIds": collection_id}):
+ crawl = BaseCrawl.from_dict(crawl_raw)
+ if crawl.state not in SUCCESSFUL_STATES:
continue
crawl_count += 1
- files = crawl.get("files", [])
+ files = crawl.files or []
for file in files:
- total_size += file.get("size", 0)
- if crawl.get("stats"):
- page_count += crawl.get("stats", {}).get("done", 0)
- if crawl.get("tags"):
- tags.extend(crawl.get("tags"))
+ total_size += file.size
+ if crawl.stats:
+ page_count += crawl.stats.done
+ if crawl.tags:
+ tags.extend(crawl.tags)
sorted_tags = [tag for tag, count in Counter(tags).most_common()]
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index fdb8e77009..3fb1d09700 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -1,6 +1,7 @@
"""
Crawl Config API handling
"""
+
# pylint: disable=too-many-lines
from typing import List, Union, Optional, Tuple, TYPE_CHECKING, cast
@@ -23,6 +24,7 @@
CrawlConfig,
CrawlConfigOut,
CrawlConfigIdNameOut,
+ CrawlOut,
EmptyStr,
UpdateCrawlConfig,
Organization,
@@ -32,7 +34,7 @@
CrawlerChannel,
CrawlerChannels,
)
-from .utils import dt_now
+from .utils import dt_now, slug_from_name
if TYPE_CHECKING:
from .orgs import OrgOps
@@ -231,6 +233,7 @@ async def add_crawl_config(
run_now=run_now,
out_filename=out_filename,
profile_filename=profile_filename or "",
+ warc_prefix=self.get_warc_prefix(org, crawlconfig),
)
if crawl_id and run_now:
@@ -297,6 +300,7 @@ async def readd_configmap(
run_now=False,
out_filename=self.default_filename_template,
profile_filename=profile_filename or "",
+ warc_prefix=self.get_warc_prefix(org, crawlconfig),
)
async def update_crawl_config(
@@ -556,7 +560,9 @@ async def get_crawl_config_ids_for_profile(
results = [CrawlConfigIdNameOut.from_dict(res) for res in results]
return results
- async def get_running_crawl(self, crawlconfig: CrawlConfig):
+ async def get_running_crawl(
+ self, crawlconfig: Union[CrawlConfig, CrawlConfigOut]
+ ) -> Optional[CrawlOut]:
"""Return the id of currently running crawl for this config, if any"""
# crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
crawls, _ = await self.crawl_ops.list_crawls(
@@ -616,13 +622,15 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
return result is not None
- def _add_curr_crawl_stats(self, crawlconfig, crawl):
+ def _add_curr_crawl_stats(
+ self, crawlconfig: CrawlConfigOut, crawl: Optional[CrawlOut]
+ ):
"""Add stats from current running crawl, if any"""
if not crawl:
return
crawlconfig.lastCrawlState = crawl.state
- crawlconfig.lastCrawlSize = crawl.stats.get("size", 0) if crawl.stats else 0
+ crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0
crawlconfig.lastCrawlStopping = crawl.stopping
async def get_crawl_config_out(self, cid: UUID, org: Organization):
@@ -811,8 +819,9 @@ async def get_crawl_config_search_values(self, org):
"workflowIds": workflow_ids,
}
- async def run_now(self, cid: UUID, org: Organization, user: User):
- """run specified crawlconfig now"""
+ async def prepare_for_run_crawl(self, cid: UUID, org: Organization) -> CrawlConfig:
+ """prepare for running a crawl, returning crawlconfig and
+ validating that running crawls is allowed"""
crawlconfig = await self.get_crawl_config(cid, org.id)
if not crawlconfig:
@@ -820,11 +829,6 @@ async def run_now(self, cid: UUID, org: Organization, user: User):
status_code=404, detail=f"Crawl Config '{cid}' not found"
)
- if await self.get_running_crawl(crawlconfig):
- raise HTTPException(status_code=400, detail="crawl_already_running")
-
- crawl_id = None
-
# ensure crawlconfig exists
try:
await self.crawl_manager.get_configmap(crawlconfig.id)
@@ -838,9 +842,21 @@ async def run_now(self, cid: UUID, org: Organization, user: User):
if await self.org_ops.exec_mins_quota_reached(org.id):
raise HTTPException(status_code=403, detail="exec_minutes_quota_reached")
+ return crawlconfig
+
+ async def run_now(self, cid: UUID, org: Organization, user: User):
+ """run specified crawlconfig now"""
+ crawlconfig = await self.prepare_for_run_crawl(cid, org)
+
+ if await self.get_running_crawl(crawlconfig):
+ raise HTTPException(status_code=400, detail="crawl_already_running")
+
try:
crawl_id = await self.crawl_manager.create_crawl_job(
- crawlconfig, org.storage, userid=str(user.id)
+ crawlconfig,
+ org.storage,
+ userid=str(user.id),
+ warc_prefix=self.get_warc_prefix(org, crawlconfig),
)
await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
return crawl_id
@@ -896,6 +912,21 @@ def get_channel_crawler_image(
"""Get crawler image name by id"""
return self.crawler_images_map.get(crawler_channel or "")
+ def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
+ """Generate WARC prefix slug from org slug, name or url
+ if no name is provided, hostname is used from url, otherwise
+ url is ignored"""
+ name = crawlconfig.name
+ if not name:
+ if crawlconfig.config.seeds and len(crawlconfig.config.seeds):
+ url = crawlconfig.config.seeds[0].url
+ parts = urllib.parse.urlsplit(url)
+ name = parts.netloc
+
+ name = slug_from_name(name or "")
+ prefix = org.slug + "-" + name
+ return prefix[:80]
+
# ============================================================================
# pylint: disable=too-many-locals
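The slug_from_name helper imported above lives in btrixcloud/utils.py and is not shown in this diff; the body below is an assumption about its behavior, included only to make the WARC-prefix construction concrete:

    import re


    def slug_from_name(name: str) -> str:
        # assumed behavior: lowercase and join alphanumeric runs with dashes
        return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")


    # example: org slug "my-org", workflow named "Example.com Weekly Crawl"
    prefix = "my-org" + "-" + slug_from_name("Example.com Weekly Crawl")
    assert prefix == "my-org-example-com-weekly-crawl"
    # get_warc_prefix() additionally truncates the result to 80 characters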
diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py
index fe455e8341..536da58274 100644
--- a/backend/btrixcloud/crawlmanager.py
+++ b/backend/btrixcloud/crawlmanager.py
@@ -96,9 +96,11 @@ async def run_replica_job(
"replica_secret_name": replica_storage.get_storage_secret_name(oid),
"replica_file_path": replica_file_path,
"replica_endpoint": replica_endpoint,
- "primary_secret_name": primary_storage.get_storage_secret_name(oid)
- if primary_storage
- else None,
+ "primary_secret_name": (
+ primary_storage.get_storage_secret_name(oid)
+ if primary_storage
+ else None
+ ),
"primary_file_path": primary_file_path if primary_file_path else None,
"primary_endpoint": primary_endpoint if primary_endpoint else None,
"BgJobType": BgJobType,
@@ -117,6 +119,7 @@ async def add_crawl_config(
run_now: bool,
out_filename: str,
profile_filename: str,
+ warc_prefix: str,
) -> Optional[str]:
"""add new crawl, store crawl config in configmap"""
@@ -137,7 +140,10 @@ async def add_crawl_config(
if run_now:
crawl_id = await self.create_crawl_job(
- crawlconfig, storage, str(crawlconfig.modifiedBy)
+ crawlconfig,
+ storage,
+ str(crawlconfig.modifiedBy),
+ warc_prefix,
)
await self._update_scheduled_job(crawlconfig)
@@ -149,6 +155,7 @@ async def create_crawl_job(
crawlconfig: CrawlConfig,
storage: StorageRef,
userid: str,
+ warc_prefix: str,
) -> str:
"""create new crawl job from config"""
cid = str(crawlconfig.id)
@@ -167,6 +174,38 @@ async def create_crawl_job(
crawlconfig.crawlTimeout,
crawlconfig.maxCrawlSize,
manual=True,
+ warc_prefix=warc_prefix,
+ )
+
+ async def create_qa_crawl_job(
+ self,
+ crawlconfig: CrawlConfig,
+ storage: StorageRef,
+ userid: str,
+ qa_source: str,
+ ) -> str:
+ """create new QA Run crawl job with qa source crawl id"""
+ cid = str(crawlconfig.id)
+
+ storage_secret = storage.get_storage_secret_name(str(crawlconfig.oid))
+
+ await self.has_storage_secret(storage_secret)
+
+ ts_now = dt_now().strftime("%Y%m%d%H%M%S")
+ crawl_id = f"qa-{ts_now}-{cid[:12]}"
+
+ return await self.new_crawl_job(
+ cid,
+ userid,
+ crawlconfig.oid,
+ storage,
+ crawlconfig.crawlerChannel,
+ 1,
+ 0,
+ 0,
+ warc_prefix="qa",
+ crawl_id=crawl_id,
+ qa_source=qa_source,
)
async def update_crawl_config(
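For reference, the QA run crawl id generated by create_qa_crawl_job above follows the pattern qa-<timestamp>-<first 12 chars of the config id>; a tiny worked example with a made-up config id (the real code uses dt_now() from btrixcloud.utils rather than datetime directly):

    from datetime import datetime
    from uuid import UUID

    cid = str(UUID("12345678-1234-5678-1234-567812345678"))  # made-up config id
    ts_now = datetime(2024, 3, 1, 12, 0, 0).strftime("%Y%m%d%H%M%S")
    crawl_id = f"qa-{ts_now}-{cid[:12]}"
    assert crawl_id == "qa-20240301120000-12345678-123"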
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 2d3788e97c..4711bec709 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -1,12 +1,15 @@
""" Crawl API """
+
# pylint: disable=too-many-lines
import json
import re
+import contextlib
import urllib.parse
+from datetime import datetime
from uuid import UUID
-from typing import Optional, List, Dict, Union
+from typing import Optional, List, Dict, Union, Any
from fastapi import Depends, HTTPException
from fastapi.responses import StreamingResponse
@@ -16,19 +19,27 @@
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .utils import dt_now, parse_jsonl_error_messages, stream_dict_list_as_csv
from .basecrawls import BaseCrawlOps
+from .crawlmanager import CrawlManager
from .models import (
UpdateCrawl,
DeleteCrawlList,
CrawlConfig,
UpdateCrawlConfig,
CrawlScale,
+ CrawlStats,
+ CrawlFile,
Crawl,
CrawlOut,
CrawlOutWithResources,
+ QARun,
+ QARunOut,
+ QARunWithResources,
+ DeleteQARunList,
Organization,
User,
PaginatedResponse,
RUNNING_AND_STARTING_STATES,
+ SUCCESSFUL_STATES,
ALL_CRAWL_STATES,
)
@@ -38,13 +49,15 @@
# ============================================================================
+# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods
class CrawlOps(BaseCrawlOps):
"""Crawl Ops"""
- # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods
+ crawl_manager: CrawlManager
- def __init__(self, *args):
+ def __init__(self, crawl_manager: CrawlManager, *args):
super().__init__(*args)
+ self.crawl_manager = crawl_manager
self.crawl_configs.set_crawl_ops(self)
self.colls.set_crawl_ops(self)
self.event_webhook_ops.set_crawl_ops(self)
@@ -75,6 +88,28 @@ async def init_index(self):
await self.crawls.create_index([("state", pymongo.HASHED)])
await self.crawls.create_index([("fileSize", pymongo.DESCENDING)])
+ async def get_crawl(
+ self,
+ crawlid: str,
+ org: Optional[Organization] = None,
+ project: Optional[dict[str, bool]] = None,
+ ) -> Crawl:
+ """Get crawl data for internal use"""
+ res = await self.get_crawl_raw(crawlid, org, "crawl", project)
+ return Crawl.from_dict(res)
+
+ @contextlib.asynccontextmanager
+ async def get_redis(self, crawl_id):
+ """get redis url for crawl id"""
+ redis_url = self.crawl_manager.get_redis_url(crawl_id)
+
+ redis = await self.crawl_manager.get_redis_client(redis_url)
+
+ try:
+ yield redis
+ finally:
+ await redis.close()
+
async def list_crawls(
self,
org: Optional[Organization] = None,
@@ -192,7 +227,7 @@ async def list_crawls(
crawl = cls.from_dict(result)
files = result.get("files") if resources else None
crawl = await self._resolve_crawl_refs(
- crawl, org, add_first_seed=False, files=files
+ crawl, org, files=files, add_first_seed=False
)
crawls.append(crawl)
@@ -221,16 +256,6 @@ async def delete_crawls(
return {"deleted": True, "storageQuotaReached": quota_reached}
- async def get_wacz_files(self, crawl_id: str, org: Organization):
- """Return list of WACZ files associated with crawl."""
- wacz_files = []
- crawl_raw = await self.get_crawl_raw(crawl_id, org)
- crawl = Crawl.from_dict(crawl_raw)
- for file_ in crawl.files:
- if file_.filename.endswith(".wacz"):
- wacz_files.append(file_)
- return wacz_files
-
# pylint: disable=too-many-arguments
async def add_new_crawl(
self,
@@ -277,16 +302,15 @@ async def add_new_crawl(
return dt_now
except pymongo.errors.DuplicateKeyError:
- # print(f"Crawl Already Added: {crawl.id} - {crawl.state}")
return None
async def update_crawl_scale(
self, crawl_id: str, org: Organization, crawl_scale: CrawlScale, user: User
):
"""Update crawl scale in the db"""
- crawl = await self.get_crawl_raw(crawl_id, org)
+ crawl = await self.get_crawl(crawl_id, org)
update = UpdateCrawlConfig(scale=crawl_scale.scale)
- await self.crawl_configs.update_crawl_config(crawl["cid"], org, user, update)
+ await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update)
result = await self.crawls.find_one_and_update(
{"_id": crawl_id, "type": "crawl", "oid": org.id},
@@ -383,35 +407,15 @@ async def match_crawl_queue(self, crawl_id, regex, offset=0):
return {"total": total, "matched": matched, "nextOffset": next_offset}
- async def get_errors_from_redis(
- self, crawl_id: str, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1
- ):
- """Get crawl errors from Redis and optionally store in mongodb."""
- # Zero-index page for query
- page = page - 1
- skip = page * page_size
- upper_bound = skip + page_size - 1
-
- async with self.get_redis(crawl_id) as redis:
- try:
- errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound)
- total = await redis.llen(f"{crawl_id}:e")
- except exceptions.ConnectionError:
- # pylint: disable=raise-missing-from
- raise HTTPException(status_code=503, detail="error_logs_not_available")
-
- parsed_errors = parse_jsonl_error_messages(errors)
- return parsed_errors, total
-
async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add):
"""add new exclusion to config or remove exclusion from config
for given crawl_id, update config on crawl"""
- crawl_raw = await self.get_crawl_raw(crawl_id, org, project={"cid": True})
+ crawl = await self.get_crawl(crawl_id, org, project={"cid": True})
- cid = crawl_raw.get("cid")
+ cid = crawl.cid
- scale = crawl_raw.get("scale", 1)
+ scale = crawl.scale or 1
async with self.get_redis(crawl_id) as redis:
query = {
@@ -435,50 +439,111 @@ async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add):
return {"success": True}
async def update_crawl_state_if_allowed(
- self, crawl_id, state, allowed_from, **kwargs
+ self,
+ crawl_id: str,
+ is_qa: bool,
+ state: str,
+ allowed_from: List[str],
+ finished: Optional[datetime] = None,
+ stats: Optional[CrawlStats] = None,
):
"""update crawl state and other properties in db if state has changed"""
- kwargs["state"] = state
- query = {"_id": crawl_id, "type": "crawl"}
+ prefix = "" if not is_qa else "qa."
+
+ update: Dict[str, Any] = {f"{prefix}state": state}
+ if finished:
+ update[f"{prefix}finished"] = finished
+ if stats:
+ update[f"{prefix}stats"] = stats.dict()
+
+ query: Dict[str, Any] = {"_id": crawl_id, "type": "crawl"}
if allowed_from:
- query["state"] = {"$in": allowed_from}
+ query[f"{prefix}state"] = {"$in": allowed_from}
- return await self.crawls.find_one_and_update(query, {"$set": kwargs})
+ return await self.crawls.find_one_and_update(query, {"$set": update})
- async def update_running_crawl_stats(self, crawl_id, stats):
+ async def update_running_crawl_stats(
+ self, crawl_id: str, is_qa: bool, stats: CrawlStats
+ ):
"""update running crawl stats"""
- query = {"_id": crawl_id, "type": "crawl", "state": "running"}
- return await self.crawls.find_one_and_update(query, {"$set": {"stats": stats}})
+ prefix = "" if not is_qa else "qa."
+ query = {"_id": crawl_id, "type": "crawl", f"{prefix}state": "running"}
+ return await self.crawls.find_one_and_update(
+ query, {"$set": {f"{prefix}stats": stats.dict()}}
+ )
- async def inc_crawl_exec_time(self, crawl_id, exec_time):
+ async def inc_crawl_exec_time(
+ self,
+ crawl_id: str,
+ is_qa: bool,
+ exec_time,
+ last_updated_time,
+ ):
"""increment exec time"""
+ # update both the crawl's total QA exec seconds and the current QA run's exec seconds
+ if is_qa:
+ inc_update = {
+ "qaCrawlExecSeconds": exec_time,
+ "qa.crawlExecSeconds": exec_time,
+ }
+ else:
+ inc_update = {"crawlExecSeconds": exec_time}
+
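+ # only increment when _lut differs from last_updated_time, so the same interval isn't counted twice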
return await self.crawls.find_one_and_update(
- {"_id": crawl_id, "type": "crawl"},
- {"$inc": {"crawlExecSeconds": exec_time}},
+ {
+ "_id": crawl_id,
+ "type": "crawl",
+ "_lut": {"$ne": last_updated_time},
+ },
+ {
+ "$inc": inc_update,
+ "$set": {"_lut": last_updated_time},
+ },
)
- async def get_crawl_state(self, crawl_id):
+ async def get_crawl_exec_last_update_time(self, crawl_id):
+ """get crawl last updated time"""
+ res = await self.crawls.find_one(
+ {"_id": crawl_id, "type": "crawl"}, projection=["_lut"]
+ )
+ return res and res.get("_lut")
+
+ async def get_crawl_state(self, crawl_id: str, is_qa: bool):
"""return current crawl state of a crawl"""
+ prefix = "" if not is_qa else "qa."
+
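+ # project qa.state/qa.finished onto state/finished when is_qa is set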
res = await self.crawls.find_one(
- {"_id": crawl_id}, projection=["state", "finished"]
+ {"_id": crawl_id},
+ projection={"state": f"${prefix}state", "finished": f"${prefix}finished"},
)
if not res:
return None, None
return res.get("state"), res.get("finished")
- async def add_crawl_errors(self, crawl_id, errors):
- """add crawl errors from redis to mongodb errors field"""
+ async def add_crawl_error(
+ self,
+ crawl_id: str,
+ is_qa: bool,
+ error: str,
+ ):
+ """add crawl error from redis to mongodb errors field"""
+ prefix = "" if not is_qa else "qa."
+
await self.crawls.find_one_and_update(
- {"_id": crawl_id}, {"$push": {"errors": {"$each": errors}}}
+ {"_id": crawl_id}, {"$push": {f"{prefix}errors": error}}
)
- async def add_crawl_file(self, crawl_id, crawl_file, size):
+ async def add_crawl_file(
+ self, crawl_id: str, is_qa: bool, crawl_file: CrawlFile, size: int
+ ):
"""add new crawl file to crawl"""
+ prefix = "" if not is_qa else "qa."
+
await self.crawls.find_one_and_update(
{"_id": crawl_id},
{
- "$push": {"files": crawl_file.dict()},
- "$inc": {"fileCount": 1, "fileSize": size},
+ "$push": {f"{prefix}files": crawl_file.dict()},
+ "$inc": {f"{prefix}fileCount": 1, f"{prefix}fileSize": size},
},
)
@@ -493,9 +558,10 @@ async def get_crawl_seeds(
skip = (page - 1) * page_size
upper_bound = skip + page_size
- crawl_raw = await self.get_crawl_raw(crawl_id, org)
+ crawl = await self.get_crawl(crawl_id, org)
+ if not crawl.config or not crawl.config.seeds:
+ return [], 0
try:
- crawl = Crawl.from_dict(crawl_raw)
return crawl.config.seeds[skip:upper_bound], len(crawl.config.seeds)
# pylint: disable=broad-exception-caught
except Exception:
@@ -515,60 +581,261 @@ async def get_crawl_stats(
if org:
query["oid"] = org.id
- async for crawl in self.crawls.find(query):
+ async for crawl_raw in self.crawls.find(query):
+ crawl = Crawl.from_dict(crawl_raw)
data: Dict[str, Union[str, int]] = {}
- data["id"] = str(crawl.get("_id"))
+ data["id"] = crawl.id
- oid = crawl.get("oid")
- data["oid"] = str(oid)
- data["org"] = org_slugs[oid]
+ data["oid"] = str(crawl.oid)
+ data["org"] = org_slugs[crawl.oid]
- data["cid"] = str(crawl.get("cid"))
- crawl_name = crawl.get("name")
- data["name"] = f'"{crawl_name}"' if crawl_name else ""
- data["state"] = crawl.get("state")
+ data["cid"] = crawl.id
+ data["name"] = f'"{crawl.name}"' if crawl.name else ""
+ data["state"] = crawl.state
- userid = crawl.get("userid")
- data["userid"] = str(userid)
- data["user"] = user_emails.get(userid)
+ data["userid"] = str(crawl.userid)
+ data["user"] = user_emails.get(crawl.userid)
- started = crawl.get("started")
- finished = crawl.get("finished")
-
- data["started"] = str(started)
- data["finished"] = str(finished)
+ data["started"] = str(crawl.started)
+ data["finished"] = str(crawl.finished)
data["duration"] = 0
- if started and finished:
- duration = finished - started
+ duration_seconds = 0
+ if crawl.started and crawl.finished:
+ duration = crawl.finished - crawl.started
duration_seconds = int(duration.total_seconds())
if duration_seconds:
data["duration"] = duration_seconds
- done_stats = None
- if crawl.get("stats") and crawl.get("stats").get("done"):
- done_stats = crawl["stats"]["done"]
-
- data["pages"] = 0
- if done_stats:
- data["pages"] = done_stats
+ if crawl.stats:
+ data["pages"] = crawl.stats.done
- data["filesize"] = crawl.get("fileSize", 0)
+ data["filesize"] = crawl.fileSize
data["avg_page_time"] = 0
- if (
- done_stats
- and done_stats != 0
- and started
- and finished
- and duration_seconds
- ):
- data["avg_page_time"] = int(duration_seconds / done_stats)
+ if crawl.stats and crawl.stats.done != 0 and duration_seconds:
+ data["avg_page_time"] = int(duration_seconds / crawl.stats.done)
crawls_data.append(data)
return crawls_data
+ async def shutdown_crawl(
+ self, crawl_id: str, org: Organization, graceful: bool
+ ) -> Dict[str, bool]:
+ """stop or cancel specified crawl"""
+ crawl = await self.get_base_crawl(crawl_id, org)
+ if crawl and crawl.type != "crawl":
+ raise HTTPException(status_code=400, detail="not_a_crawl")
+
+ result = None
+ try:
+ result = await self.crawl_manager.shutdown_crawl(
+ crawl_id, graceful=graceful
+ )
+
+ if result.get("success"):
+ if graceful:
+ await self.crawls.find_one_and_update(
+ {"_id": crawl_id, "type": "crawl", "oid": org.id},
+ {"$set": {"stopping": True}},
+ )
+ return result
+
+ except Exception as exc:
+ # pylint: disable=raise-missing-from
+ # if reached here, probably crawl doesn't exist anymore
+ raise HTTPException(
+ status_code=404, detail=f"crawl_not_found, (details: {exc})"
+ )
+
+ # if job no longer running, canceling is considered success,
+ # but graceful stoppage is not possible, so would be a failure
+ if result.get("error") == "Not Found":
+ if not graceful:
+ await self.update_crawl_state(crawl_id, "canceled")
+ crawl = await self.get_crawl(crawl_id, org)
+ if not await self.crawl_configs.stats_recompute_last(crawl.cid, 0, -1):
+ raise HTTPException(
+ status_code=404,
+ detail=f"crawl_config_not_found: {crawl.cid}",
+ )
+
+ return {"success": True}
+
+ # return whatever detail may be included in the response
+ raise HTTPException(status_code=400, detail=result)
+
+ async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User):
+ """Start crawl QA run"""
+
+ crawl = await self.get_crawl(crawl_id, org)
+
+ # can only QA finished crawls
+ if not crawl.finished:
+ raise HTTPException(status_code=400, detail="crawl_not_finished")
+
+ # can only QA successfully finished crawls
+ if crawl.state not in SUCCESSFUL_STATES:
+ raise HTTPException(status_code=400, detail="crawl_did_not_succeed")
+
+ # can only run one QA at a time
+ if crawl.qa:
+ raise HTTPException(status_code=400, detail="qa_already_running")
+
+ # not a valid crawl
+ if not crawl.cid or crawl.type != "crawl":
+ raise HTTPException(status_code=400, detail="invalid_crawl_for_qa")
+
+ crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org)
+
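+ # create the QA crawl job, using this finished crawl as its QA source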
+ try:
+ qa_run_id = await self.crawl_manager.create_qa_crawl_job(
+ crawlconfig,
+ org.storage,
+ userid=str(user.id),
+ qa_source=crawl_id,
+ )
+
+ image = self.crawl_configs.get_channel_crawler_image(
+ crawlconfig.crawlerChannel
+ )
+
+ qa_run = QARun(
+ id=qa_run_id,
+ started=dt_now(),
+ userid=user.id,
+ userName=user.name,
+ state="starting",
+ image=image,
+ )
+
+ await self.crawls.find_one_and_update(
+ {"_id": crawl_id},
+ {
+ "$set": {
+ "qa": qa_run.dict(),
+ }
+ },
+ )
+
+ return qa_run_id
+
+ except Exception as exc:
+ # pylint: disable=raise-missing-from
+ raise HTTPException(status_code=500, detail=f"Error starting QA run: {exc}")
+
+ async def stop_crawl_qa_run(self, crawl_id: str, org: Organization):
+ """Stop crawl QA run, QA run removed when actually finished"""
+ crawl = await self.get_crawl(crawl_id, org)
+
+ if not crawl.qa:
+ raise HTTPException(status_code=400, detail="qa_not_running")
+
+ try:
+ result = await self.crawl_manager.shutdown_crawl(crawl.qa.id, graceful=True)
+
+ if result.get("error") == "Not Found":
+ # treat as success: the QA crawl no longer exists, so there is nothing to stop
+ result = {"success": True}
+
+ return result
+
+ except Exception as exc:
+ # pylint: disable=raise-missing-from
+ # if reached here, probably crawl doesn't exist anymore
+ raise HTTPException(
+ status_code=404, detail=f"crawl_not_found, (details: {exc})"
+ )
+
+ async def delete_crawl_qa_runs(self, crawl_id: str, delete_list: DeleteQARunList):
+ """delete specified finished QA run"""
+
+ count = 0
+ for qa_run_id in delete_list.qa_run_ids:
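+ # finished QA runs are stored in the qaFinished dict, keyed by QA run id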
+ res = await self.crawls.find_one_and_update(
+ {"_id": crawl_id, "type": "crawl"},
+ {"$unset": {f"qaFinished.{qa_run_id}": ""}},
+ )
+
+ if res:
+ count += 1
+
+ await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id)
+
+ return {"deleted": count}
+
+ async def qa_run_finished(self, crawl_id: str):
+ """clear active qa, add qa run to finished list, if successful"""
+ crawl = await self.get_crawl(crawl_id)
+
+ if not crawl.qa:
+ return False
+
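+ # always clear the active qa; only successfully finished runs are kept in qaFinished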
+ query: Dict[str, Any] = {"qa": None}
+
+ if crawl.qa.finished and crawl.qa.state in SUCCESSFUL_STATES:
+ query[f"qaFinished.{crawl.qa.id}"] = crawl.qa.dict()
+
+ if await self.crawls.find_one_and_update(
+ {"_id": crawl_id, "type": "crawl"}, {"$set": query}
+ ):
+ return True
+
+ return False
+
+ async def get_qa_runs(
+ self, crawl_id: str, org: Optional[Organization] = None
+ ) -> List[QARunOut]:
+ """Return list of QA runs"""
+ crawl_data = await self.get_crawl_raw(
+ crawl_id, org, "crawl", project={"qaFinished": True, "qa": True}
+ )
+ qa_finished = crawl_data.get("qaFinished") or {}
+ all_qa = [QARunOut(**qa_run_data) for qa_run_data in qa_finished.values()]
+ all_qa.sort(key=lambda x: x.finished or dt_now(), reverse=True)
+ qa = crawl_data.get("qa")
+ if qa:
+ all_qa.insert(0, QARunOut(**qa))
+ return all_qa
+
+ async def get_active_qa(
+ self, crawl_id: str, org: Optional[Organization] = None
+ ) -> Optional[QARunOut]:
+ """return just the active QA, if any"""
+ crawl_data = await self.get_crawl_raw(
+ crawl_id, org, "crawl", project={"qa": True}
+ )
+ qa = crawl_data.get("qa")
+ return QARunOut(**qa) if qa else None
+
+ async def get_qa_run_for_replay(
+ self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None
+ ) -> QARunWithResources:
+ """Fetch QA runs with resources for replay.json"""
+ crawl = await self.get_crawl(crawl_id, org)
+ qa_finished = crawl.qaFinished or {}
+ qa_run = qa_finished.get(qa_run_id)
+
+ if not qa_run:
+ raise HTTPException(status_code=404, detail="crawl_qa_not_found")
+
+ if not org:
+ org = await self.orgs.get_org_by_id(crawl.oid)
+ if not org:
+ raise HTTPException(status_code=400, detail="missing_org")
+
+ resources = await self._resolve_signed_urls(
+ qa_run.files, org, crawl.id, qa_run_id
+ )
+
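+ # files are returned as presigned resources, so omit the raw file list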
+ qa_run.files = []
+
+ qa_run_dict = qa_run.dict()
+ qa_run_dict["resources"] = resources
+
+ return QARunWithResources(**qa_run_dict)
+
# ============================================================================
async def recompute_crawl_file_count_and_size(crawls, crawl_id):
@@ -590,11 +857,11 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id):
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, too-many-statements
-def init_crawls_api(app, user_dep, *args):
+def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
"""API for crawl management, including crawl done callback"""
# pylint: disable=invalid-name, duplicate-code
- ops = CrawlOps(*args)
+ ops = CrawlOps(crawl_manager, *args)
org_viewer_dep = ops.orgs.org_viewer_dep
org_crawl_dep = ops.orgs.org_crawl_dep
@@ -744,15 +1011,81 @@ async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)):
if not user.is_superuser:
raise HTTPException(status_code=403, detail="Not Allowed")
- return await ops.get_crawl(crawl_id, None, "crawl")
+ return await ops.get_crawl_out(crawl_id, None, "crawl")
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/replay.json",
tags=["crawls"],
response_model=CrawlOutWithResources,
)
- async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)):
- return await ops.get_crawl(crawl_id, org, "crawl")
+ async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
+ return await ops.get_crawl_out(crawl_id, org, "crawl")
+
+ # QA APIs
+ # ---------------------
+ @app.get(
+ "/orgs/all/crawls/{crawl_id}/qa/{qa_run_id}/replay.json",
+ tags=["qa"],
+ response_model=QARunWithResources,
+ )
+ async def get_qa_run_admin(crawl_id, qa_run_id, user: User = Depends(user_dep)):
+ if not user.is_superuser:
+ raise HTTPException(status_code=403, detail="Not Allowed")
+
+ return await ops.get_qa_run_for_replay(crawl_id, qa_run_id)
+
+ @app.get(
+ "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/replay.json",
+ tags=["qa"],
+ response_model=QARunWithResources,
+ )
+ async def get_qa_run(
+ crawl_id, qa_run_id, org: Organization = Depends(org_viewer_dep)
+ ):
+ return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)
+
+ @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["qa"])
+ async def start_crawl_qa_run(
+ crawl_id: str,
+ org: Organization = Depends(org_crawl_dep),
+ user: User = Depends(user_dep),
+ ):
+ qa_run_id = await ops.start_crawl_qa_run(crawl_id, org, user)
+ return {"started": qa_run_id}
+
+ @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["qa"])
+ async def stop_crawl_qa_run(
+ crawl_id: str, org: Organization = Depends(org_crawl_dep)
+ ):
+ # pylint: disable=unused-argument
+ return await ops.stop_crawl_qa_run(crawl_id, org)
+
+ @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/delete", tags=["qa"])
+ async def delete_crawl_qa_runs(
+ crawl_id: str,
+ qa_run_ids: DeleteQARunList,
+ org: Organization = Depends(org_crawl_dep),
+ ):
+ # pylint: disable=unused-argument
+ return await ops.delete_crawl_qa_runs(crawl_id, qa_run_ids)
+
+ @app.get(
+ "/orgs/{oid}/crawls/{crawl_id}/qa",
+ tags=["qa"],
+ response_model=List[QARunOut],
+ )
+ async def get_qa_runs(crawl_id, org: Organization = Depends(org_viewer_dep)):
+ return await ops.get_qa_runs(crawl_id, org)
+
+ @app.get(
+ "/orgs/{oid}/crawls/{crawl_id}/qa/activeQA",
+ tags=["qa"],
+ response_model=Dict[str, Optional[QARunOut]],
+ )
+ async def get_active_qa(crawl_id, org: Organization = Depends(org_viewer_dep)):
+ return {"qa": await ops.get_active_qa(crawl_id, org)}
+
+ # ----
@app.get(
"/orgs/all/crawls/{crawl_id}",
@@ -889,7 +1222,7 @@ async def stream_crawl_logs(
logLevel: Optional[str] = None,
context: Optional[str] = None,
):
- crawl = await ops.get_crawl(crawl_id, org, "crawl")
+ crawl = await ops.get_crawl_out(crawl_id, org)
log_levels = []
contexts = []
@@ -898,11 +1231,10 @@ async def stream_crawl_logs(
if context:
contexts = context.split(",")
- # If crawl is finished, stream logs from WACZ files
+ # If crawl is finished, stream logs from WACZ files using presigned urls
if crawl.finished:
- wacz_files = await ops.get_wacz_files(crawl_id, org)
resp = await ops.storage_ops.sync_stream_wacz_logs(
- org, wacz_files, log_levels, contexts
+ crawl.resources or [], log_levels, contexts
)
return StreamingResponse(
resp,
@@ -924,18 +1256,13 @@ async def get_crawl_errors(
page: int = 1,
org: Organization = Depends(org_viewer_dep),
):
- crawl_raw = await ops.get_crawl_raw(crawl_id, org)
- crawl = Crawl.from_dict(crawl_raw)
+ crawl = await ops.get_crawl(crawl_id, org)
- if crawl.finished:
- skip = (page - 1) * pageSize
- upper_bound = skip + pageSize
- errors = crawl.errors[skip:upper_bound]
- parsed_errors = parse_jsonl_error_messages(errors)
- total = len(crawl.errors)
- return paginated_format(parsed_errors, total, page, pageSize)
-
- errors, total = await ops.get_errors_from_redis(crawl_id, pageSize, page)
- return paginated_format(errors, total, page, pageSize)
+ skip = (page - 1) * pageSize
+ upper_bound = skip + pageSize
+
+ errors = crawl.errors[skip:upper_bound] if crawl.errors else []
+ parsed_errors = parse_jsonl_error_messages(errors)
+ return paginated_format(parsed_errors, len(crawl.errors or []), page, pageSize)
return ops
diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index d91c6d2a80..93207ea12a 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -1,13 +1,14 @@
"""
Browsertrix API Mongo DB initialization
"""
+
import importlib.util
import os
import urllib
import asyncio
from uuid import UUID
-from typing import Optional, Union
+from typing import Optional, Union, TypeVar, Type
import motor.motor_asyncio
from pydantic import BaseModel
@@ -79,6 +80,7 @@ async def update_and_prepare_db(
coll_ops,
invite_ops,
storage_ops,
+ page_ops,
db_inited,
):
"""Prepare database for application.
@@ -91,10 +93,16 @@ async def update_and_prepare_db(
"""
await ping_db(mdb)
print("Database setup started", flush=True)
- if await run_db_migrations(mdb, user_manager):
+ if await run_db_migrations(mdb, user_manager, page_ops):
await drop_indexes(mdb)
await create_indexes(
- org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager
+ org_ops,
+ crawl_ops,
+ crawl_config_ops,
+ coll_ops,
+ invite_ops,
+ user_manager,
+ page_ops,
)
await user_manager.create_super_user()
await org_ops.create_default_org()
@@ -104,7 +112,7 @@ async def update_and_prepare_db(
# ============================================================================
-async def run_db_migrations(mdb, user_manager):
+async def run_db_migrations(mdb, user_manager, page_ops):
"""Run database migrations."""
# if first run, just set version and exit
@@ -136,7 +144,7 @@ async def run_db_migrations(mdb, user_manager):
assert spec.loader
migration_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(migration_module)
- migration = migration_module.Migration(mdb)
+ migration = migration_module.Migration(mdb, page_ops=page_ops)
if await migration.run():
migrations_run = True
except ImportError as err:
@@ -184,7 +192,7 @@ async def drop_indexes(mdb):
# ============================================================================
# pylint: disable=too-many-arguments
async def create_indexes(
- org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager
+ org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager, page_ops
):
"""Create database indexes."""
print("Creating database indexes", flush=True)
@@ -194,6 +202,11 @@ async def create_indexes(
await coll_ops.init_index()
await invite_ops.init_index()
await user_manager.init_index()
+ await page_ops.init_index()
+
+
+# ============================================================================
+T = TypeVar("T")
# ============================================================================
@@ -208,10 +221,10 @@ def id_str(self):
return str(self.id)
@classmethod
- def from_dict(cls, data):
+ def from_dict(cls: Type[T], data: dict) -> T:
"""convert dict from mongo to a class"""
if not data:
- return None
+ return cls()
data["id"] = data.pop("_id")
return cls(**data)
diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py
index c2d70f4458..578c22692e 100644
--- a/backend/btrixcloud/k8sapi.py
+++ b/backend/btrixcloud/k8sapi.py
@@ -1,4 +1,5 @@
""" K8S API Access """
+
import os
import traceback
@@ -29,7 +30,9 @@ def __init__(self):
self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"
self.custom_resources = {}
- self.templates = Jinja2Templates(directory=get_templates_dir())
+ self.templates = Jinja2Templates(
+ directory=get_templates_dir(), autoescape=False
+ )
config.load_incluster_config()
self.client = client
@@ -66,7 +69,10 @@ def get_redis_url(self, crawl_id):
async def get_redis_client(self, redis_url):
"""return redis client with correct params for one-time use"""
return aioredis.from_url(
- redis_url, decode_responses=True, auto_close_connection_pool=True
+ redis_url,
+ decode_responses=True,
+ auto_close_connection_pool=True,
+ socket_timeout=20,
)
# pylint: disable=too-many-arguments, too-many-locals
@@ -82,6 +88,8 @@ def new_crawl_job_yaml(
max_crawl_size=0,
manual=True,
crawl_id=None,
+ warc_prefix="",
+ qa_source="",
):
"""load job template from yaml"""
if not crawl_id:
@@ -100,6 +108,8 @@ def new_crawl_job_yaml(
"storage_name": str(storage),
"manual": "1" if manual else "0",
"crawler_channel": crawler_channel,
+ "warc_prefix": warc_prefix,
+ "qa_source": qa_source,
}
data = self.templates.env.get_template("crawl_job.yaml").render(params)
@@ -167,12 +177,14 @@ async def has_storage_secret(self, storage_secret) -> bool:
async def delete_crawl_job(self, crawl_id):
"""delete custom crawljob object"""
try:
+ name = f"crawljob-{crawl_id}"
+
await self.custom_api.delete_namespaced_custom_object(
group="btrix.cloud",
version="v1",
namespace=self.namespace,
plural="crawljobs",
- name=f"crawljob-{crawl_id}",
+ name=name,
grace_period_seconds=0,
# delete as background to allow operator to do proper cleanup
propagation_policy="Background",
@@ -210,20 +222,17 @@ async def get_profile_browser(self, browserid):
)
async def _patch_job(self, crawl_id, body, pluraltype="crawljobs") -> dict:
- content_type = self.api_client.default_headers.get("Content-Type")
-
try:
- self.api_client.set_default_header(
- "Content-Type", "application/merge-patch+json"
- )
+ name = f"{pluraltype[:-1]}-{crawl_id}"
await self.custom_api.patch_namespaced_custom_object(
group="btrix.cloud",
version="v1",
namespace=self.namespace,
plural=pluraltype,
- name=f"{pluraltype[:-1]}-{crawl_id}",
+ name=name,
body={"spec": body},
+ _content_type="application/merge-patch+json",
)
return {"success": True}
# pylint: disable=broad-except
@@ -231,12 +240,6 @@ async def _patch_job(self, crawl_id, body, pluraltype="crawljobs") -> dict:
traceback.print_exc()
return {"error": str(exc)}
- finally:
- if content_type:
- self.api_client.set_default_header("Content-Type", content_type)
- else:
- del self.api_client.default_headers["Content-Type"]
-
async def print_pod_logs(self, pod_names, lines=100):
"""print pod logs"""
for pod in pod_names:
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 2908d6b42f..4378a8701b 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -2,6 +2,7 @@
main file for browsertrix-api system
supports docker and kubernetes based deployments of multiple browsertrix-crawlers
"""
+
import os
import asyncio
import sys
@@ -28,6 +29,7 @@
from .basecrawls import init_base_crawls_api
from .webhooks import init_event_webhooks_api
from .background_jobs import init_background_jobs_api
+from .pages import init_pages_api
from .crawlmanager import CrawlManager
from .utils import run_once_lock, register_exit_handler, is_bool
@@ -65,6 +67,7 @@ def main():
os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120)
),
"maxPagesPerCrawl": int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
+ "maxScale": int(os.environ.get("MAX_CRAWL_SCALE", 3)),
}
invites = init_invites(mdb, email)
@@ -131,10 +134,10 @@ def main():
base_crawl_init = (
app,
current_active_user,
+ # remaining args are passed through to BaseCrawlOps
mdb,
user_manager,
org_ops,
- crawl_manager,
crawl_config_ops,
coll_ops,
storage_ops,
@@ -144,7 +147,14 @@ def main():
base_crawl_ops = init_base_crawls_api(*base_crawl_init)
- crawls = init_crawls_api(*base_crawl_init)
+ crawls = init_crawls_api(crawl_manager, *base_crawl_init)
+
+ page_ops = init_pages_api(
+ app, mdb, crawls, org_ops, storage_ops, current_active_user
+ )
+
+ base_crawl_ops.set_page_ops(page_ops)
+ crawls.set_page_ops(page_ops)
init_uploads_api(*base_crawl_init)
@@ -168,6 +178,7 @@ def main():
coll_ops,
invites,
storage_ops,
+ page_ops,
db_inited,
)
)
@@ -188,12 +199,20 @@ async def get_settings():
async def openapi() -> JSONResponse:
return JSONResponse(app_root.openapi())
- @app_root.get("/healthz", include_in_schema=False)
- async def healthz():
+ # Used for startup
+ # Returns 200 only when db is available + migrations are done
+ @app_root.get("/healthzStartup", include_in_schema=False)
+ async def healthz_startup():
if not db_inited.get("inited"):
raise HTTPException(status_code=503, detail="not_ready_yet")
return {}
+ # Used for readiness + liveness
+ # Always returns 200 while running
+ @app_root.get("/healthz", include_in_schema=False)
+ async def healthz():
+ return {}
+
app_root.include_router(app, prefix=API_PREFIX)
diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py
index 751c17527e..2b2119821e 100644
--- a/backend/btrixcloud/main_op.py
+++ b/backend/btrixcloud/main_op.py
@@ -21,6 +21,7 @@
from .storages import init_storages_api
from .webhooks import EventWebhookOps
from .background_jobs import BackgroundJobOps
+from .pages import PageOps
app_root = FastAPI()
@@ -76,10 +77,10 @@ def main():
coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops)
crawl_ops = CrawlOps(
+ crawl_manager,
mdb,
user_manager,
org_ops,
- crawl_manager,
crawl_config_ops,
coll_ops,
storage_ops,
@@ -87,6 +88,10 @@ def main():
background_job_ops,
)
+ page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops)
+
+ crawl_ops.set_page_ops(page_ops)
+
background_job_ops.set_ops(crawl_ops, profile_ops)
return init_operator_api(
@@ -98,6 +103,7 @@ def main():
storage_ops,
event_webhook_ops,
background_job_ops,
+ page_ops,
)
@@ -106,5 +112,5 @@ def main():
async def startup():
"""init on startup"""
register_exit_handler()
- oper = main()
- await oper.async_init()
+ settings = main()
+ await settings.async_init()
diff --git a/backend/btrixcloud/migrations/__init__.py b/backend/btrixcloud/migrations/__init__.py
index 7488769f6c..e983072b23 100644
--- a/backend/btrixcloud/migrations/__init__.py
+++ b/backend/btrixcloud/migrations/__init__.py
@@ -1,6 +1,7 @@
"""
BaseMigration class to subclass in each migration module
"""
+
import os
from pymongo.errors import OperationFailure
diff --git a/backend/btrixcloud/migrations/migration_0001_archives_to_orgs.py b/backend/btrixcloud/migrations/migration_0001_archives_to_orgs.py
index 79b7139b99..49742d2fe7 100644
--- a/backend/btrixcloud/migrations/migration_0001_archives_to_orgs.py
+++ b/backend/btrixcloud/migrations/migration_0001_archives_to_orgs.py
@@ -1,6 +1,7 @@
"""
Migration 0001 - Archives to Orgs
"""
+
import os
from pymongo.errors import OperationFailure
@@ -23,8 +24,9 @@ class Migration(BaseMigration):
"profiles",
]
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up."""
diff --git a/backend/btrixcloud/migrations/migration_0002_crawlconfig_crawlstats.py b/backend/btrixcloud/migrations/migration_0002_crawlconfig_crawlstats.py
index 3f24b6fab6..ad148c93e0 100644
--- a/backend/btrixcloud/migrations/migration_0002_crawlconfig_crawlstats.py
+++ b/backend/btrixcloud/migrations/migration_0002_crawlconfig_crawlstats.py
@@ -1,6 +1,7 @@
"""
Migration 0002 - Dropping CrawlConfig crawl stats
"""
+
from btrixcloud.migrations import BaseMigration
@@ -10,8 +11,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py b/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py
index f773316c9a..c0427bc582 100644
--- a/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py
+++ b/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py
@@ -1,6 +1,7 @@
"""
Migration 0003 - Mutable crawl configs and crawl revision history
"""
+
from datetime import datetime
from btrixcloud.models import Crawl, CrawlConfig
@@ -13,8 +14,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0004_config_seeds.py b/backend/btrixcloud/migrations/migration_0004_config_seeds.py
index a4e9f66e09..8f6e5ff6c2 100644
--- a/backend/btrixcloud/migrations/migration_0004_config_seeds.py
+++ b/backend/btrixcloud/migrations/migration_0004_config_seeds.py
@@ -1,6 +1,7 @@
"""
Migration 0004 - Ensuring all config.seeds are Seeds not HttpUrls
"""
+
from pydantic import HttpUrl
from btrixcloud.models import Crawl, CrawlConfig, ScopeType, Seed
@@ -13,8 +14,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
@@ -100,12 +102,14 @@ async def migrate_up(self):
# Test migration
async for config_dict in crawl_configs.find({}):
config = CrawlConfig.from_dict(config_dict)
- for seed in config.config.seeds:
+ seeds = config.config.seeds or []
+ for seed in seeds:
assert isinstance(seed, Seed)
assert seed.url
async for crawl_dict in crawls.find({}):
crawl = Crawl.from_dict(crawl_dict)
- for seed in crawl.config.seeds:
+ seeds = crawl.config.seeds or []
+ for seed in seeds:
assert isinstance(seed, Seed)
assert seed.url
diff --git a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py
index 7134f63c04..ab225e54e7 100644
--- a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py
+++ b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py
@@ -1,6 +1,7 @@
"""
Migration 0005 - Updating scheduled cron jobs after Operator changes
"""
+
from btrixcloud.models import CrawlConfig, UpdateCrawlConfig
from btrixcloud.crawlmanager import CrawlManager
from btrixcloud.migrations import BaseMigration
@@ -12,8 +13,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py b/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py
index f920bd64a3..3af7ebddca 100644
--- a/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py
+++ b/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py
@@ -1,6 +1,7 @@
"""
Migration 0006 - Precomputing workflow crawl stats
"""
+
from btrixcloud.crawlconfigs import stats_recompute_all
from btrixcloud.migrations import BaseMigration
@@ -11,8 +12,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py b/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py
index ec5d2e94dd..f5b0efe0d0 100644
--- a/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py
+++ b/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py
@@ -4,6 +4,7 @@
- Rename colls to autoAddCollections
- Re-calculate workflow crawl stats to populate crawlSuccessfulCount
"""
+
from btrixcloud.crawlconfigs import stats_recompute_all
from btrixcloud.migrations import BaseMigration
@@ -14,8 +15,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up."""
diff --git a/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py b/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py
index 1bc8db2ac4..63856e543e 100644
--- a/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py
+++ b/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py
@@ -1,6 +1,7 @@
"""
Migration 0008 - Precomputing crawl file stats
"""
+
from btrixcloud.crawls import recompute_crawl_file_count_and_size
from btrixcloud.migrations import BaseMigration
@@ -11,8 +12,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0009_crawl_types.py b/backend/btrixcloud/migrations/migration_0009_crawl_types.py
index 54c6f490d7..08e5bc60a5 100644
--- a/backend/btrixcloud/migrations/migration_0009_crawl_types.py
+++ b/backend/btrixcloud/migrations/migration_0009_crawl_types.py
@@ -1,6 +1,7 @@
"""
Migration 0009 - Crawl types
"""
+
from btrixcloud.migrations import BaseMigration
@@ -10,8 +11,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0010_collection_total_size.py b/backend/btrixcloud/migrations/migration_0010_collection_total_size.py
index e2a2eb3d2b..8e6234954a 100644
--- a/backend/btrixcloud/migrations/migration_0010_collection_total_size.py
+++ b/backend/btrixcloud/migrations/migration_0010_collection_total_size.py
@@ -1,6 +1,7 @@
"""
Migration 0010 - Precomputing collection total size
"""
+
from btrixcloud.colls import CollectionOps
from btrixcloud.migrations import BaseMigration
@@ -11,8 +12,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0011_crawl_timeout_configmap.py b/backend/btrixcloud/migrations/migration_0011_crawl_timeout_configmap.py
index 3a9b2abd81..6793e90f38 100644
--- a/backend/btrixcloud/migrations/migration_0011_crawl_timeout_configmap.py
+++ b/backend/btrixcloud/migrations/migration_0011_crawl_timeout_configmap.py
@@ -1,6 +1,7 @@
"""
Migration 0011 - Remove None CRAWL_TIMEOUT values from configmaps
"""
+
import os
from btrixcloud.k8sapi import K8sAPI
@@ -14,8 +15,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0012_notes_to_description.py b/backend/btrixcloud/migrations/migration_0012_notes_to_description.py
index b6fb6d70b8..7e33763421 100644
--- a/backend/btrixcloud/migrations/migration_0012_notes_to_description.py
+++ b/backend/btrixcloud/migrations/migration_0012_notes_to_description.py
@@ -1,6 +1,7 @@
"""
Migration 0012 - Notes to description
"""
+
from btrixcloud.migrations import BaseMigration
@@ -10,8 +11,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0013_crawl_name.py b/backend/btrixcloud/migrations/migration_0013_crawl_name.py
index 3c120f88d0..6bf57fefa7 100644
--- a/backend/btrixcloud/migrations/migration_0013_crawl_name.py
+++ b/backend/btrixcloud/migrations/migration_0013_crawl_name.py
@@ -1,6 +1,7 @@
"""
Migration 0013 - Copy config name to crawls
"""
+
from btrixcloud.migrations import BaseMigration
@@ -10,8 +11,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py b/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py
index 8243136c33..ac18163a02 100644
--- a/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py
+++ b/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py
@@ -1,6 +1,7 @@
"""
Migration 0014 - collections to collectionIDs
"""
+
from btrixcloud.migrations import BaseMigration
@@ -10,8 +11,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py b/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py
index c1582ee406..dd006966d0 100644
--- a/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py
+++ b/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py
@@ -1,6 +1,7 @@
"""
Migration 0015 - Calculate and store org storage usage
"""
+
from btrixcloud.migrations import BaseMigration
@@ -11,8 +12,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py b/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py
index b1cd648ed5..0e8ff0a449 100644
--- a/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py
+++ b/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py
@@ -1,6 +1,7 @@
"""
Migration 0016 - Updating scheduled cron jobs after Operator changes v2
"""
+
import os
from btrixcloud.models import CrawlConfig, UpdateCrawlConfig
from btrixcloud.crawlmanager import CrawlManager
@@ -13,8 +14,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0017_storage_by_type.py b/backend/btrixcloud/migrations/migration_0017_storage_by_type.py
index ae2b6e6fd0..2549ec6f4c 100644
--- a/backend/btrixcloud/migrations/migration_0017_storage_by_type.py
+++ b/backend/btrixcloud/migrations/migration_0017_storage_by_type.py
@@ -1,6 +1,7 @@
"""
Migration 0017 - Calculate and store org storage usage by type
"""
+
from btrixcloud.migrations import BaseMigration
@@ -11,8 +12,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0018_usernames.py b/backend/btrixcloud/migrations/migration_0018_usernames.py
index 9b6afe794a..5129a02853 100644
--- a/backend/btrixcloud/migrations/migration_0018_usernames.py
+++ b/backend/btrixcloud/migrations/migration_0018_usernames.py
@@ -1,6 +1,7 @@
"""
Migration 0018 - Store crawl and workflow userName directly in db
"""
+
from btrixcloud.migrations import BaseMigration
from btrixcloud.emailsender import EmailSender
@@ -15,8 +16,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0019_org_slug.py b/backend/btrixcloud/migrations/migration_0019_org_slug.py
index defe4dfcad..ec3e08cd88 100644
--- a/backend/btrixcloud/migrations/migration_0019_org_slug.py
+++ b/backend/btrixcloud/migrations/migration_0019_org_slug.py
@@ -1,6 +1,7 @@
"""
Migration 0019 - Organization slug
"""
+
from btrixcloud.migrations import BaseMigration
from btrixcloud.utils import slug_from_name
@@ -11,8 +12,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0020_org_storage_refs.py b/backend/btrixcloud/migrations/migration_0020_org_storage_refs.py
index 291d79d633..9704b31bc9 100644
--- a/backend/btrixcloud/migrations/migration_0020_org_storage_refs.py
+++ b/backend/btrixcloud/migrations/migration_0020_org_storage_refs.py
@@ -1,6 +1,7 @@
"""
Migration 0020 - New Storage Ref System
"""
+
from btrixcloud.migrations import BaseMigration
@@ -10,8 +11,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0021_profile_filenames.py b/backend/btrixcloud/migrations/migration_0021_profile_filenames.py
index 1f9cd4c71e..56be52ab75 100644
--- a/backend/btrixcloud/migrations/migration_0021_profile_filenames.py
+++ b/backend/btrixcloud/migrations/migration_0021_profile_filenames.py
@@ -1,6 +1,7 @@
"""
Migration 0021 - Profile filenames
"""
+
from btrixcloud.crawlmanager import CrawlManager
from btrixcloud.migrations import BaseMigration
from btrixcloud.models import CrawlConfig, Profile, UpdateCrawlConfig
@@ -13,8 +14,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0022_partial_complete.py b/backend/btrixcloud/migrations/migration_0022_partial_complete.py
index 7b90bbb525..88c2190d4b 100644
--- a/backend/btrixcloud/migrations/migration_0022_partial_complete.py
+++ b/backend/btrixcloud/migrations/migration_0022_partial_complete.py
@@ -1,6 +1,7 @@
"""
Migration 0022 -- Partial Complete
"""
+
from btrixcloud.migrations import BaseMigration
@@ -10,8 +11,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0023_available_extra_exec_mins.py b/backend/btrixcloud/migrations/migration_0023_available_extra_exec_mins.py
index b0ac3d98f0..fa50f35247 100644
--- a/backend/btrixcloud/migrations/migration_0023_available_extra_exec_mins.py
+++ b/backend/btrixcloud/migrations/migration_0023_available_extra_exec_mins.py
@@ -1,6 +1,7 @@
"""
Migration 0023 -- Available extra/gifted minutes
"""
+
from btrixcloud.migrations import BaseMigration
@@ -10,8 +11,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0024_crawlerchannel.py b/backend/btrixcloud/migrations/migration_0024_crawlerchannel.py
index 6ffa4214c1..afbb1ae7af 100644
--- a/backend/btrixcloud/migrations/migration_0024_crawlerchannel.py
+++ b/backend/btrixcloud/migrations/migration_0024_crawlerchannel.py
@@ -1,6 +1,7 @@
"""
Migration 0024 -- crawlerChannel
"""
+
from btrixcloud.crawlmanager import CrawlManager
from btrixcloud.migrations import BaseMigration
from btrixcloud.models import CrawlConfig, UpdateCrawlConfig
@@ -12,8 +13,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/migrations/migration_0025_workflow_db_configmap_fixes.py b/backend/btrixcloud/migrations/migration_0025_workflow_db_configmap_fixes.py
index 6c159e69b5..170318efbf 100644
--- a/backend/btrixcloud/migrations/migration_0025_workflow_db_configmap_fixes.py
+++ b/backend/btrixcloud/migrations/migration_0025_workflow_db_configmap_fixes.py
@@ -1,6 +1,7 @@
"""
Migration 0025 -- fix workflow database and configmap issues.
"""
+
from btrixcloud.crawlmanager import CrawlManager
from btrixcloud.migrations import BaseMigration
from btrixcloud.models import CrawlConfig, UpdateCrawlConfig
@@ -12,8 +13,9 @@
class Migration(BaseMigration):
"""Migration class."""
- def __init__(self, mdb, migration_version=MIGRATION_VERSION):
- super().__init__(mdb, migration_version)
+ # pylint: disable=unused-argument
+ def __init__(self, mdb, **kwargs):
+ super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index c14340e3e3..3621032831 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -387,6 +387,8 @@ def get_raw_config(self):
class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
"""Crawl Config Output"""
+ id: UUID
+
lastCrawlStopping: Optional[bool] = False
profileName: Optional[str]
firstSeed: Optional[str]
@@ -523,36 +525,70 @@ class CrawlFileOut(BaseModel):
# ============================================================================
-class BaseCrawl(BaseMongoModel):
- """Base Crawl object (representing crawls, uploads and manual sessions)"""
+class ReviewStatus(str, Enum):
+ """QA review statuses"""
- id: str
+ GOOD = "good"
+ ACCEPTABLE = "acceptable"
+ FAILURE = "failure"
+
+
+# ============================================================================
+class CrawlStats(BaseModel):
+ """Crawl Stats for pages and size"""
+
+ found: int = 0
+ done: int = 0
+ size: int = 0
- type: str
+
+# ============================================================================
+class CoreCrawlable(BaseModel):
+ # pylint: disable=too-few-public-methods
+ """Core properties for crawlable run (crawl or qa run)"""
+
+ id: str
userid: UUID
userName: Optional[str]
- oid: UUID
started: datetime
finished: Optional[datetime] = None
- name: Optional[str] = ""
-
state: str
- stats: Optional[Dict[str, int]] = None
+ crawlExecSeconds: int = 0
- files: Optional[List[CrawlFile]] = []
+ image: Optional[str]
- description: Optional[str] = ""
+ stats: Optional[CrawlStats] = CrawlStats()
+
+ files: List[CrawlFile] = []
+
+ fileSize: int = 0
+ fileCount: int = 0
errors: Optional[List[str]] = []
+
+# ============================================================================
+class BaseCrawl(CoreCrawlable, BaseMongoModel):
+ """Base Crawl object (representing crawls, uploads and manual sessions)"""
+
+ type: str
+
+ oid: UUID
+ cid: Optional[UUID] = None
+
+ name: Optional[str] = ""
+
+ description: Optional[str] = ""
+
+ tags: Optional[List[str]] = []
+
collectionIds: Optional[List[UUID]] = []
- fileSize: int = 0
- fileCount: int = 0
+ reviewStatus: Optional[ReviewStatus] = None
# ============================================================================
@@ -587,7 +623,7 @@ class CrawlOut(BaseMongoModel):
state: str
- stats: Optional[Dict[str, int]]
+ stats: Optional[CrawlStats]
fileSize: int = 0
fileCount: int = 0
@@ -599,6 +635,7 @@ class CrawlOut(BaseMongoModel):
collectionIds: Optional[List[UUID]] = []
crawlExecSeconds: int = 0
+ qaCrawlExecSeconds: int = 0
# automated crawl fields
config: Optional[RawCrawlConfig]
@@ -617,6 +654,8 @@ class CrawlOut(BaseMongoModel):
crawlerChannel: str = "default"
image: Optional[str]
+ reviewStatus: Optional[ReviewStatus] = None
+
# ============================================================================
class CrawlOutWithResources(CrawlOut):
@@ -634,6 +673,7 @@ class UpdateCrawl(BaseModel):
description: Optional[str]
tags: Optional[List[str]]
collectionIds: Optional[List[UUID]]
+ reviewStatus: Optional[ReviewStatus]
# ============================================================================
@@ -643,6 +683,13 @@ class DeleteCrawlList(BaseModel):
crawl_ids: List[str]
+# ============================================================================
+class DeleteQARunList(BaseModel):
+ """delete qa run list POST body"""
+
+ qa_run_ids: List[str]
+
+
# ============================================================================
### AUTOMATED CRAWLS ###
@@ -655,6 +702,36 @@ class CrawlScale(BaseModel):
scale: conint(ge=1, le=MAX_CRAWL_SCALE) = 1 # type: ignore
+# ============================================================================
+class QARun(CoreCrawlable, BaseModel):
+ """Subdocument to track QA runs for given crawl"""
+
+
+# ============================================================================
+class QARunWithResources(QARun):
+ """QA crawl output model including resources"""
+
+ resources: Optional[List[CrawlFileOut]] = []
+
+
+# ============================================================================
+class QARunOut(BaseModel):
+ """QA Run Output"""
+
+ id: str
+
+ userName: Optional[str]
+
+ started: datetime
+ finished: Optional[datetime] = None
+
+ state: str
+
+ crawlExecSeconds: int = 0
+
+ stats: CrawlStats = CrawlStats()
+
+
# ============================================================================
class Crawl(BaseCrawl, CrawlConfigCore):
"""Store State of a Crawl (Finished or Running)"""
@@ -672,9 +749,10 @@ class Crawl(BaseCrawl, CrawlConfigCore):
stopping: Optional[bool] = False
- crawlExecSeconds: int = 0
+ qaCrawlExecSeconds: int = 0
- image: Optional[str]
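+ # currently active QA run, if any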
+ qa: Optional[QARun] = None
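+ # finished QA runs, keyed by QA run id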
+ qaFinished: Optional[Dict[str, QARun]] = {}
# ============================================================================
@@ -704,8 +782,6 @@ class UploadedCrawl(BaseCrawl):
type: Literal["upload"] = "upload"
- tags: Optional[List[str]] = []
-
# ============================================================================
class UpdateUpload(UpdateCrawl):
@@ -896,8 +972,15 @@ class OrgOut(BaseMongoModel):
storageQuotaReached: Optional[bool]
execMinutesQuotaReached: Optional[bool]
+ # total usage and exec time
usage: Optional[Dict[str, int]]
crawlExecSeconds: Dict[str, int] = {}
+
+ # qa only usage + exec time
+ qaUsage: Optional[Dict[str, int]] = {}
+ qaCrawlExecSeconds: Dict[str, int] = {}
+
+ # exec time limits
monthlyExecSeconds: Dict[str, int] = {}
extraExecSeconds: Dict[str, int] = {}
giftedExecSeconds: Dict[str, int] = {}
@@ -931,8 +1014,15 @@ class Organization(BaseMongoModel):
bytesStoredUploads: int = 0
bytesStoredProfiles: int = 0
+ # total usage + exec time
usage: Dict[str, int] = {}
crawlExecSeconds: Dict[str, int] = {}
+
+ # qa only usage + exec time
+ qaUsage: Dict[str, int] = {}
+ qaCrawlExecSeconds: Dict[str, int] = {}
+
+ # exec time limits
monthlyExecSeconds: Dict[str, int] = {}
extraExecSeconds: Dict[str, int] = {}
giftedExecSeconds: Dict[str, int] = {}
@@ -1263,27 +1353,27 @@ class BaseCollectionItemBody(WebhookNotificationBody):
class CollectionItemAddedBody(BaseCollectionItemBody):
"""Webhook notification POST body for collection additions"""
- event: Literal[
+ event: Literal[WebhookEventType.ADDED_TO_COLLECTION] = (
WebhookEventType.ADDED_TO_COLLECTION
- ] = WebhookEventType.ADDED_TO_COLLECTION
+ )
# ============================================================================
class CollectionItemRemovedBody(BaseCollectionItemBody):
"""Webhook notification POST body for collection removals"""
- event: Literal[
+ event: Literal[WebhookEventType.REMOVED_FROM_COLLECTION] = (
WebhookEventType.REMOVED_FROM_COLLECTION
- ] = WebhookEventType.REMOVED_FROM_COLLECTION
+ )
# ============================================================================
class CollectionDeletedBody(WebhookNotificationBody):
"""Webhook notification base POST body for collection changes"""
- event: Literal[
+ event: Literal[WebhookEventType.COLLECTION_DELETED] = (
WebhookEventType.COLLECTION_DELETED
- ] = WebhookEventType.COLLECTION_DELETED
+ )
collectionId: str
@@ -1414,3 +1504,110 @@ class AnyJob(BaseModel):
"""Union of all job types, for response model"""
__root__: Union[CreateReplicaJob, DeleteReplicaJob, BackgroundJob]
+
+
+# ============================================================================
+
+### PAGES ###
+
+
+# ============================================================================
+class PageReviewUpdate(BaseModel):
+ """Update model for page manual review/approval"""
+
+ approved: Optional[bool] = None
+
+
+# ============================================================================
+class PageNoteIn(BaseModel):
+ """Input model for adding page notes"""
+
+ text: str
+
+
+# ============================================================================
+class PageNoteEdit(BaseModel):
+ """Input model for editing page notes"""
+
+ id: UUID
+ text: str
+
+
+# ============================================================================
+class PageNoteDelete(BaseModel):
+ """Delete model for page notes"""
+
+ delete_list: List[UUID] = []
+
+
+# ============================================================================
+class PageNote(BaseModel):
+ """Model for page notes, tracking user and time"""
+
+ id: UUID
+ text: str
+ created: datetime = datetime.now()
+ userid: UUID
+ userName: str
+
+
+# ============================================================================
+class PageQACompare(BaseModel):
+ """Model for updating pages from QA run"""
+
+ screenshotMatch: Optional[float] = None
+ textMatch: Optional[float] = None
+ resourceCounts: Optional[Dict[str, int]]
+
+
+# ============================================================================
+class Page(BaseMongoModel):
+ """Core page data, no QA"""
+
+ id: UUID
+
+ oid: UUID
+ crawl_id: str
+
+ # core page data
+ url: AnyHttpUrl
+ title: Optional[str] = None
+ ts: Optional[datetime] = None
+ loadState: Optional[int] = None
+ status: Optional[int] = None
+
+ # manual review
+ userid: Optional[UUID] = None
+ modified: Optional[datetime] = None
+ approved: Optional[bool] = None
+ notes: List[PageNote] = []
+
+
+# ============================================================================
+class PageWithAllQA(Page):
+ """Model for core page data + qa"""
+
+ # automated heuristics, keyed by QA run id
+ qa: Optional[Dict[str, PageQACompare]] = {}
+
+
+# ============================================================================
+class PageOut(Page):
+ """Model for pages output, no QA"""
+
+ status: Optional[int] = 200
+
+
+# ============================================================================
+class PageOutWithSingleQA(Page):
+ """Page out with single QA entry"""
+
+ qa: Optional[PageQACompare] = None
+
+
+# ============================================================================
+class PagesAndResources(BaseModel):
+ """moage for qa configmap data, pages + resources"""
+
+ resources: List[CrawlFileOut] = []
+ pages: List[PageOut] = []
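
The page models above separate core page data from QA heuristics: Page holds crawl-time metadata plus manual review fields, while PageWithAllQA.qa keys automated comparison scores by QA run id. A minimal sketch of building one such record, assuming the imports shown (the ids, score values, and resourceCounts keys are illustrative only, not taken from the crawler):

from uuid import uuid4
from datetime import datetime

from btrixcloud.models import PageQACompare, PageWithAllQA

# illustrative ids and scores; the resourceCounts keys are an assumption
page = PageWithAllQA(
    id=uuid4(),
    oid=uuid4(),
    crawl_id="manual-20240101-abc",
    url="https://example.com/",
    title="Example",
    ts=datetime.utcnow(),
    qa={
        "qa-20240102-def": PageQACompare(
            screenshotMatch=0.97,
            textMatch=0.99,
            resourceCounts={"crawlGood": 12, "crawlBad": 0},
        )
    },
)
print(page.json())
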
diff --git a/backend/btrixcloud/operator/__init__.py b/backend/btrixcloud/operator/__init__.py
new file mode 100644
index 0000000000..dd5f4830da
--- /dev/null
+++ b/backend/btrixcloud/operator/__init__.py
@@ -0,0 +1,28 @@
+""" operators module """
+
+from .profiles import ProfileOperator
+from .bgjobs import BgJobOperator
+from .cronjobs import CronJobOperator
+from .crawls import CrawlOperator
+from .baseoperator import K8sOpAPI
+
+operator_classes = [ProfileOperator, BgJobOperator, CronJobOperator, CrawlOperator]
+
+
+# ============================================================================
+def init_operator_api(app, *args):
+ """registers webhook handlers for metacontroller"""
+
+ k8s = K8sOpAPI()
+
+ operators = []
+ for cls in operator_classes:
+ oper = cls(k8s, *args)
+ oper.init_routes(app)
+ operators.append(oper)
+
+ @app.get("/healthz", include_in_schema=False)
+ async def healthz():
+ return {}
+
+ return k8s
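
Note that init_operator_api only registers the webhook routes and returns the shared K8sOpAPI; the caller is still expected to run async_init() once the event loop is up. A minimal sketch of that wiring, assuming the ops objects are constructed elsewhere (the actual entrypoint module may differ):

from fastapi import FastAPI

from btrixcloud.operator import init_operator_api

def create_operator_app(*ops) -> FastAPI:
    # ops = (crawl_config_ops, crawl_ops, org_ops, coll_ops, storage_ops,
    #        event_webhook_ops, background_job_ops, page_ops)
    app = FastAPI()
    k8s = init_operator_api(app, *ops)

    @app.on_event("startup")
    async def startup() -> None:
        # probe for the pod metrics API before the first sync request
        await k8s.async_init()

    return app
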
diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py
new file mode 100644
index 0000000000..b06d8bf051
--- /dev/null
+++ b/backend/btrixcloud/operator/baseoperator.py
@@ -0,0 +1,150 @@
+""" Base Operator class for all operators """
+
+import asyncio
+from typing import TYPE_CHECKING
+from kubernetes.utils import parse_quantity
+
+import yaml
+from btrixcloud.k8sapi import K8sAPI
+
+
+if TYPE_CHECKING:
+ from btrixcloud.crawlconfigs import CrawlConfigOps
+ from btrixcloud.crawls import CrawlOps
+ from btrixcloud.orgs import OrgOps
+ from btrixcloud.colls import CollectionOps
+ from btrixcloud.storages import StorageOps
+ from btrixcloud.webhooks import EventWebhookOps
+ from btrixcloud.users import UserManager
+ from btrixcloud.background_jobs import BackgroundJobOps
+ from btrixcloud.pages import PageOps
+ from redis.asyncio.client import Redis
+else:
+ CrawlConfigOps = CrawlOps = OrgOps = CollectionOps = Redis = object
+ StorageOps = EventWebhookOps = UserManager = BackgroundJobOps = PageOps = object
+
+
+# ============================================================================
+class K8sOpAPI(K8sAPI):
+ """Additional k8s api for operators"""
+
+ def __init__(self):
+ super().__init__()
+ self.config_file = "/config/config.yaml"
+ with open(self.config_file, encoding="utf-8") as fh_config:
+ self.shared_params = yaml.safe_load(fh_config)
+
+ self.has_pod_metrics = False
+ self.compute_crawler_resources()
+ self.compute_profile_resources()
+
+ def compute_crawler_resources(self):
+ """compute memory / cpu resources for crawlers"""
+ p = self.shared_params
+ num = max(int(p["crawler_browser_instances"]) - 1, 0)
+ print("crawler resources")
+ if not p.get("crawler_cpu"):
+ base = parse_quantity(p["crawler_cpu_base"])
+ extra = parse_quantity(p["crawler_extra_cpu_per_browser"])
+
+ # cpu is a floating value of cpu cores
+ p["crawler_cpu"] = float(base + num * extra)
+
+ print(f"cpu = {base} + {num} * {extra} = {p['crawler_cpu']}")
+ else:
+ print(f"cpu = {p['crawler_cpu']}")
+
+ if not p.get("crawler_memory"):
+ base = parse_quantity(p["crawler_memory_base"])
+ extra = parse_quantity(p["crawler_extra_memory_per_browser"])
+
+ # memory is always an int
+ p["crawler_memory"] = int(base + num * extra)
+
+ print(f"memory = {base} + {num} * {extra} = {p['crawler_memory']}")
+ else:
+ print(f"memory = {p['crawler_memory']}")
+
+ def compute_profile_resources(self):
+ """compute memory /cpu resources for a single profile browser"""
+ p = self.shared_params
+ # if no profile specific options provided, default to crawler base for one browser
+ profile_cpu = parse_quantity(
+ p.get("profile_browser_cpu") or p["crawler_cpu_base"]
+ )
+ profile_memory = parse_quantity(
+ p.get("profile_browser_memory") or p["crawler_memory_base"]
+ )
+ p["profile_cpu"] = profile_cpu
+ p["profile_memory"] = profile_memory
+
+ print("profile browser resources")
+ print(f"cpu = {profile_cpu}")
+ print(f"memory = {profile_memory}")
+
+ async def async_init(self):
+ """perform any async init here"""
+ self.has_pod_metrics = await self.is_pod_metrics_available()
+ print("Pod Metrics Available:", self.has_pod_metrics)
+
+
+# pylint: disable=too-many-instance-attributes, too-many-arguments
+# ============================================================================
+class BaseOperator:
+ """BaseOperator"""
+
+ k8s: K8sOpAPI
+ crawl_config_ops: CrawlConfigOps
+ crawl_ops: CrawlOps
+ orgs_ops: OrgOps
+ coll_ops: CollectionOps
+ storage_ops: StorageOps
+ event_webhook_ops: EventWebhookOps
+ background_job_ops: BackgroundJobOps
+ user_ops: UserManager
+ page_ops: PageOps
+
+ def __init__(
+ self,
+ k8s,
+ crawl_config_ops,
+ crawl_ops,
+ org_ops,
+ coll_ops,
+ storage_ops,
+ event_webhook_ops,
+ background_job_ops,
+ page_ops,
+ ):
+ self.k8s = k8s
+ self.crawl_config_ops = crawl_config_ops
+ self.crawl_ops = crawl_ops
+ self.org_ops = org_ops
+ self.coll_ops = coll_ops
+ self.storage_ops = storage_ops
+ self.background_job_ops = background_job_ops
+ self.event_webhook_ops = event_webhook_ops
+ self.page_ops = page_ops
+
+ self.user_ops = crawl_config_ops.user_manager
+
+ # to avoid background tasks being garbage collected
+ # see: https://stackoverflow.com/a/74059981
+ self.bg_tasks = set()
+
+ def init_routes(self, app):
+ """init routes for this operator"""
+
+ def run_task(self, func):
+ """add bg tasks to set to avoid premature garbage collection"""
+ task = asyncio.create_task(func)
+ self.bg_tasks.add(task)
+ task.add_done_callback(self.bg_tasks.discard)
+
+ def load_from_yaml(self, filename, params):
+ """load and parse k8s template from yaml file"""
+ return list(
+ yaml.safe_load_all(
+ self.k8s.templates.env.get_template(filename).render(params)
+ )
+ )
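
run_task uses the standard trick of holding a strong reference to each asyncio task so it is not garbage collected before it finishes (per the linked Stack Overflow answer). A standalone sketch of just that pattern:

import asyncio

class TaskRunner:
    def __init__(self) -> None:
        # strong references keep pending tasks alive until completion
        self.bg_tasks: set = set()

    def run_task(self, coro) -> None:
        task = asyncio.create_task(coro)
        self.bg_tasks.add(task)
        # drop the reference once the task is done
        task.add_done_callback(self.bg_tasks.discard)

async def main() -> None:
    runner = TaskRunner()
    runner.run_task(asyncio.sleep(0.1))
    await asyncio.sleep(0.2)  # give the background task time to finish

if __name__ == "__main__":
    asyncio.run(main())
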
diff --git a/backend/btrixcloud/operator/bgjobs.py b/backend/btrixcloud/operator/bgjobs.py
new file mode 100644
index 0000000000..fad3deea41
--- /dev/null
+++ b/backend/btrixcloud/operator/bgjobs.py
@@ -0,0 +1,62 @@
+""" Operator handler for BackgroundJobs """
+
+from uuid import UUID
+import traceback
+
+from btrixcloud.utils import (
+ from_k8s_date,
+ dt_now,
+)
+
+from .models import MCDecoratorSyncData
+from .baseoperator import BaseOperator
+
+
+# ============================================================================
+class BgJobOperator(BaseOperator):
+ """BgJobOperator"""
+
+ def init_routes(self, app):
+ """init routes for this operator"""
+
+ # nop, but needed for metacontroller
+ @app.post("/op/backgroundjob/sync")
+ async def mc_sync_background_jobs():
+ return {"attachments": []}
+
+ @app.post("/op/backgroundjob/finalize")
+ async def mc_finalize_background_jobs(data: MCDecoratorSyncData):
+ return await self.finalize_background_job(data)
+
+ async def finalize_background_job(self, data: MCDecoratorSyncData) -> dict:
+ """handle finished background job"""
+
+ metadata = data.object["metadata"]
+ labels: dict[str, str] = metadata.get("labels", {})
+ oid: str = labels.get("btrix.org") or ""
+ job_type: str = labels.get("job_type") or ""
+ job_id: str = metadata.get("name")
+
+ status = data.object["status"]
+ success = status.get("succeeded") == 1
+ completion_time = status.get("completionTime")
+
+ finalized = True
+
+ finished = from_k8s_date(completion_time) if completion_time else dt_now()
+
+ try:
+ await self.background_job_ops.job_finished(
+ job_id, job_type, UUID(oid), success=success, finished=finished
+ )
+ # print(
+ # f"{job_type} background job completed: success: {success}, {job_id}",
+ # flush=True,
+ # )
+
+ # pylint: disable=broad-except
+ except Exception:
+ print("Update Background Job Error", flush=True)
+ traceback.print_exc()
+
+ return {"attachments": [], "finalized": finalized}
diff --git a/backend/btrixcloud/operator.py b/backend/btrixcloud/operator/crawls.py
similarity index 61%
rename from backend/btrixcloud/operator.py
rename to backend/btrixcloud/operator/crawls.py
index c90e6c8f98..c33afe1d59 100644
--- a/backend/btrixcloud/operator.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -1,63 +1,51 @@
-""" btrixjob operator (working for metacontroller) """
+""" CrawlOperator """
-import asyncio
import traceback
import os
from pprint import pprint
-from typing import Optional, DefaultDict, TYPE_CHECKING
-
-from collections import defaultdict
+from typing import Optional, Any
+from datetime import datetime
import json
-from uuid import UUID
-from fastapi import HTTPException
-import yaml
import humanize
-from pydantic import BaseModel, Field
-
from kubernetes.utils import parse_quantity
from redis import asyncio as exceptions
-from .utils import (
- from_k8s_date,
- to_k8s_date,
- dt_now,
-)
-from .k8sapi import K8sAPI
-
-from .models import (
+from btrixcloud.models import (
NON_RUNNING_STATES,
RUNNING_STATES,
RUNNING_AND_STARTING_ONLY,
RUNNING_AND_STARTING_STATES,
SUCCESSFUL_STATES,
+ FAILED_STATES,
+ CrawlStats,
CrawlFile,
CrawlCompleteIn,
StorageRef,
+ PagesAndResources,
+)
+
+from btrixcloud.utils import (
+ from_k8s_date,
+ to_k8s_date,
+ dt_now,
+)
+
+from .baseoperator import BaseOperator, Redis
+from .models import (
+ CrawlSpec,
+ CrawlStatus,
+ MCBaseRequest,
+ MCSyncData,
+ POD,
+ CMAP,
+ PVC,
+ CJS,
+ BTRIX_API,
)
-if TYPE_CHECKING:
- from .crawlconfigs import CrawlConfigOps
- from .crawls import CrawlOps
- from .orgs import OrgOps
- from .colls import CollectionOps
- from .storages import StorageOps
- from .webhooks import EventWebhookOps
- from .users import UserManager
- from .background_jobs import BackgroundJobOps
- from redis.asyncio.client import Redis
-else:
- CrawlConfigOps = CrawlOps = OrgOps = CollectionOps = Redis = object
- StorageOps = EventWebhookOps = UserManager = BackgroundJobOps = object
-
-CMAP = "ConfigMap.v1"
-PVC = "PersistentVolumeClaim.v1"
-POD = "Pod.v1"
-
-BTRIX_API = "btrix.cloud/v1"
-CJS = f"CrawlJob.{BTRIX_API}"
METRICS_API = "metrics.k8s.io/v1beta1"
METRICS = f"PodMetrics.{METRICS_API}"
@@ -73,309 +61,39 @@
EXEC_TIME_UPDATE_SECS = 60
+# pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements
+# pylint: disable=invalid-name, too-many-lines, too-many-return-statements
# ============================================================================
-class MCBaseRequest(BaseModel):
- """base metacontroller model, used for customize hook"""
-
- parent: dict
- controller: dict
-
-
-# ============================================================================
-class MCSyncData(MCBaseRequest):
- """sync / finalize metacontroller model"""
-
- children: dict
- related: dict
- finalizing: bool = False
-
-
-# ============================================================================
-class MCDecoratorSyncData(BaseModel):
- """sync for decoratorcontroller model"""
-
- object: dict
- controller: dict
-
- attachments: dict
- related: dict
- finalizing: bool = False
+class CrawlOperator(BaseOperator):
+ """CrawlOperator Handler"""
-
-# ============================================================================
-class CrawlSpec(BaseModel):
- """spec from k8s CrawlJob object"""
-
- id: str
- cid: UUID
- oid: UUID
- scale: int = 1
- storage: StorageRef
- started: str
- crawler_channel: str
- stopping: bool = False
- scheduled: bool = False
- timeout: int = 0
- max_crawl_size: int = 0
-
-
-# ============================================================================
-class PodResourcePercentage(BaseModel):
- """Resource usage percentage ratios"""
-
- memory: float = 0
- cpu: float = 0
- storage: float = 0
-
-
-# ============================================================================
-class PodResources(BaseModel):
- """Pod Resources"""
-
- memory: int = 0
- cpu: float = 0
- storage: int = 0
-
- def __init__(self, *a, **kw):
- if "memory" in kw:
- kw["memory"] = int(parse_quantity(kw["memory"]))
- if "cpu" in kw:
- kw["cpu"] = float(parse_quantity(kw["cpu"]))
- if "storage" in kw:
- kw["storage"] = int(parse_quantity(kw["storage"]))
- super().__init__(*a, **kw)
-
-
-# ============================================================================
-class PodInfo(BaseModel):
- """Aggregate pod status info held in CrawlJob"""
-
- exitTime: Optional[str] = None
- exitCode: Optional[int] = None
- isNewExit: Optional[bool] = Field(default=None, exclude=True)
- reason: Optional[str] = None
-
- allocated: PodResources = PodResources()
- used: PodResources = PodResources()
-
- newCpu: Optional[int] = None
- newMemory: Optional[int] = None
-
- def dict(self, *a, **kw):
- res = super().dict(*a, **kw)
- percent = {
- "memory": self.get_percent_memory(),
- "cpu": self.get_percent_cpu(),
- "storage": self.get_percent_storage(),
- }
- res["percent"] = percent
- return res
-
- def get_percent_memory(self) -> float:
- """compute percent memory used"""
- return (
- float(self.used.memory) / float(self.allocated.memory)
- if self.allocated.memory
- else 0
- )
-
- def get_percent_cpu(self) -> float:
- """compute percent cpu used"""
- return (
- float(self.used.cpu) / float(self.allocated.cpu)
- if self.allocated.cpu
- else 0
- )
-
- def get_percent_storage(self) -> float:
- """compute percent storage used"""
- return (
- float(self.used.storage) / float(self.allocated.storage)
- if self.allocated.storage
- else 0
- )
-
- def should_restart_pod(self):
- """return true if pod should be restarted"""
- if self.newMemory and self.newMemory != self.allocated.memory:
- return True
-
- if self.newCpu and self.newCpu != self.allocated.cpu:
- return True
-
- return False
-
-
-# ============================================================================
-class CrawlStatus(BaseModel):
- """status from k8s CrawlJob object"""
-
- state: str = "starting"
- pagesFound: int = 0
- pagesDone: int = 0
- size: int = 0
- # human readable size string
- sizeHuman: str = ""
- scale: int = 1
- filesAdded: int = 0
- filesAddedSize: int = 0
- finished: Optional[str] = None
- stopping: bool = False
- stopReason: Optional[str] = None
- initRedis: bool = False
- crawlerImage: Optional[str] = None
- lastActiveTime: str = ""
- podStatus: Optional[DefaultDict[str, PodInfo]] = defaultdict(
- lambda: PodInfo() # pylint: disable=unnecessary-lambda
- )
- # placeholder for pydantic 2.0 -- will require this version
- # podStatus: Optional[
- # DefaultDict[str, Annotated[PodInfo, Field(default_factory=PodInfo)]]
- # ]
- restartTime: Optional[str]
- canceled: bool = False
-
- # updated on pod exits and at regular interval
- # Crawl Execution Time -- time all crawler pods have been running
- # used to track resource usage and enforce execution minutes limit
- crawlExecTime: int = 0
-
- # Elapsed Exec Time -- time crawl has been running in at least one pod
- # used for crawl timeouts
- elapsedCrawlTime: int = 0
-
- # last exec time update
- lastUpdatedTime: str = ""
-
- # any pods exited
- anyCrawlPodNewExit: Optional[bool] = Field(default=False, exclude=True)
-
- # don't include in status, use by metacontroller
- resync_after: Optional[int] = Field(default=None, exclude=True)
-
-
-# ============================================================================
-# pylint: disable=too-many-statements, too-many-public-methods, too-many-branches, too-many-nested-blocks
-# pylint: disable=too-many-instance-attributes, too-many-locals, too-many-lines, too-many-arguments
-class BtrixOperator(K8sAPI):
- """BtrixOperator Handler"""
-
- crawl_config_ops: CrawlConfigOps
- crawl_ops: CrawlOps
- orgs_ops: OrgOps
- coll_ops: CollectionOps
- storage_ops: StorageOps
- event_webhook_ops: EventWebhookOps
- background_job_ops: BackgroundJobOps
- user_ops: UserManager
-
- def __init__(
- self,
- crawl_config_ops,
- crawl_ops,
- org_ops,
- coll_ops,
- storage_ops,
- event_webhook_ops,
- background_job_ops,
- ):
- super().__init__()
-
- self.crawl_config_ops = crawl_config_ops
- self.crawl_ops = crawl_ops
- self.org_ops = org_ops
- self.coll_ops = coll_ops
- self.storage_ops = storage_ops
- self.background_job_ops = background_job_ops
- self.event_webhook_ops = event_webhook_ops
-
- self.user_ops = crawl_config_ops.user_manager
-
- self.config_file = "/config/config.yaml"
+ def __init__(self, *args):
+ super().__init__(*args)
self.done_key = "crawls-done"
+ self.pages_key = "pages"
+ self.errors_key = "e"
self.fast_retry_secs = int(os.environ.get("FAST_RETRY_SECS") or 0)
self.log_failed_crawl_lines = int(os.environ.get("LOG_FAILED_CRAWL_LINES") or 0)
- with open(self.config_file, encoding="utf-8") as fh_config:
- self.shared_params = yaml.safe_load(fh_config)
-
- self._has_pod_metrics = False
- self.compute_crawler_resources()
-
- # to avoid background tasks being garbage collected
- # see: https://stackoverflow.com/a/74059981
- self.bg_tasks = set()
-
- def compute_crawler_resources(self):
- """compute memory / cpu resources for crawlers"""
- # pylint: disable=invalid-name
- p = self.shared_params
- num = max(int(p["crawler_browser_instances"]) - 1, 0)
- if not p.get("crawler_cpu"):
- base = parse_quantity(p["crawler_cpu_base"])
- extra = parse_quantity(p["crawler_extra_cpu_per_browser"])
-
- # cpu is a floating value of cpu cores
- p["crawler_cpu"] = float(base + num * extra)
+ def init_routes(self, app):
+ """init routes for this operator"""
- print(f"cpu = {base} + {num} * {extra} = {p['crawler_cpu']}")
- else:
- print(f"cpu = {p['crawler_cpu']}")
-
- if not p.get("crawler_memory"):
- base = parse_quantity(p["crawler_memory_base"])
- extra = parse_quantity(p["crawler_extra_memory_per_browser"])
-
- # memory is always an int
- p["crawler_memory"] = int(base + num * extra)
-
- print(f"memory = {base} + {num} * {extra} = {p['crawler_memory']}")
- else:
- print(f"memory = {p['crawler_memory']}")
-
- async def async_init(self):
- """perform any async init here"""
- self._has_pod_metrics = await self.is_pod_metrics_available()
- print("Pod Metrics Available:", self._has_pod_metrics)
-
- async def sync_profile_browsers(self, data: MCSyncData):
- """sync profile browsers"""
- spec = data.parent.get("spec", {})
-
- expire_time = from_k8s_date(spec.get("expireTime"))
- browserid = spec.get("id")
-
- if dt_now() >= expire_time:
- self.run_task(self.delete_profile_browser(browserid))
- return {"status": {}, "children": []}
-
- params = {}
- params.update(self.shared_params)
- params["id"] = browserid
- params["userid"] = spec.get("userid", "")
-
- oid = spec.get("oid")
- storage = StorageRef(spec.get("storageName"))
-
- storage_path = storage.get_storage_extra_path(oid)
- storage_secret = storage.get_storage_secret_name(oid)
-
- params["storage_path"] = storage_path
- params["storage_secret"] = storage_secret
- params["profile_filename"] = spec.get("profileFilename", "")
- params["crawler_image"] = spec["crawlerImage"]
-
- params["url"] = spec.get("startUrl", "about:blank")
- params["vnc_password"] = spec.get("vncPassword")
+ @app.post("/op/crawls/sync")
+ async def mc_sync_crawls(data: MCSyncData):
+ return await self.sync_crawls(data)
- children = self.load_from_yaml("profilebrowser.yaml", params)
+ # reuse sync path, but distinct endpoint for better logging
+ @app.post("/op/crawls/finalize")
+ async def mc_sync_finalize(data: MCSyncData):
+ return await self.sync_crawls(data)
- return {"status": {}, "children": children}
+ @app.post("/op/crawls/customize")
+ async def mc_related(data: MCBaseRequest):
+ return self.get_related(data)
- # pylint: disable=too-many-return-statements, invalid-name
async def sync_crawls(self, data: MCSyncData):
"""sync crawls"""
@@ -386,31 +104,43 @@ async def sync_crawls(self, data: MCSyncData):
cid = spec["cid"]
oid = spec["oid"]
- redis_url = self.get_redis_url(crawl_id)
+ redis_url = self.k8s.get_redis_url(crawl_id)
params = {}
- params.update(self.shared_params)
+ params.update(self.k8s.shared_params)
params["id"] = crawl_id
params["cid"] = cid
params["userid"] = spec.get("userid", "")
pods = data.children[POD]
+ crawl = CrawlSpec(
+ id=crawl_id,
+ cid=cid,
+ oid=oid,
+ storage=StorageRef(spec["storageName"]),
+ crawler_channel=spec.get("crawlerChannel"),
+ scale=spec.get("scale", 1),
+ started=data.parent["metadata"]["creationTimestamp"],
+ stopping=spec.get("stopping", False),
+ timeout=spec.get("timeout") or 0,
+ max_crawl_size=int(spec.get("maxCrawlSize") or 0),
+ scheduled=spec.get("manual") != "1",
+ qa_source_crawl_id=spec.get("qaSourceCrawlId"),
+ )
+
# if finalizing, crawl is being deleted
if data.finalizing:
if not status.finished:
# if can't cancel, already finished
- if not await self.cancel_crawl(
- crawl_id, UUID(cid), UUID(oid), status, data.children[POD]
- ):
- # instead of fetching the state (that was already set)
- # return exception to ignore this request, keep previous
- # finished state
- raise HTTPException(status_code=400, detail="out_of_sync_status")
+ await self.cancel_crawl(crawl, status, data.children[POD])
+            # if the crawl could not be canceled, it already finished;
+            # keep the previously set finished state instead of raising
+            # an out-of-sync error
+ # raise HTTPException(status_code=400, detail="out_of_sync_status")
return await self.finalize_response(
- crawl_id,
- UUID(oid),
+ crawl,
status,
spec,
data.children,
@@ -423,10 +153,9 @@ async def sync_crawls(self, data: MCSyncData):
print(
f"warn crawl {crawl_id} finished but not deleted, post-finish taking too long?"
)
- self.run_task(self.delete_crawl_job(crawl_id))
+ self.run_task(self.k8s.delete_crawl_job(crawl.id))
return await self.finalize_response(
- crawl_id,
- UUID(oid),
+ crawl,
status,
spec,
data.children,
@@ -438,28 +167,14 @@ async def sync_crawls(self, data: MCSyncData):
# pylint: disable=bare-except, broad-except
except:
# fail crawl if config somehow missing, shouldn't generally happen
- await self.fail_crawl(crawl_id, UUID(cid), UUID(oid), status, pods)
+ await self.fail_crawl(crawl, status, pods)
return self._empty_response(status)
- crawl = CrawlSpec(
- id=crawl_id,
- cid=cid,
- oid=oid,
- storage=StorageRef(spec["storageName"]),
- crawler_channel=spec.get("crawlerChannel"),
- scale=spec.get("scale", 1),
- started=data.parent["metadata"]["creationTimestamp"],
- stopping=spec.get("stopping", False),
- timeout=spec.get("timeout") or 0,
- max_crawl_size=int(spec.get("maxCrawlSize") or 0),
- scheduled=spec.get("manual") != "1",
- )
-
# shouldn't get here, crawl should already be finalizing when canceled
# just in case, handle canceled-but-not-finalizing here
if status.state == "canceled":
- await self.delete_crawl_job(crawl.id)
+ await self.k8s.delete_crawl_job(crawl.id)
return {"status": status.dict(exclude_none=True), "children": []}
# first, check storage quota, and fail immediately if quota reached
@@ -471,9 +186,7 @@ async def sync_crawls(self, data: MCSyncData):
and not data.children[PVC]
and await self.org_ops.storage_quota_reached(crawl.oid)
):
- await self.mark_finished(
- crawl.id, crawl.cid, crawl.oid, status, "skipped_quota_reached"
- )
+ await self.mark_finished(crawl, status, "skipped_quota_reached")
return self._empty_response(status)
if status.state in ("starting", "waiting_org_limit"):
@@ -481,7 +194,7 @@ async def sync_crawls(self, data: MCSyncData):
return self._empty_response(status)
await self.set_state(
- "starting", status, crawl.id, allowed_from=["waiting_org_limit"]
+ "starting", status, crawl, allowed_from=["waiting_org_limit"]
)
if len(pods):
@@ -501,8 +214,7 @@ async def sync_crawls(self, data: MCSyncData):
if status.finished:
return await self.finalize_response(
- crawl_id,
- UUID(oid),
+ crawl,
status,
spec,
data.children,
@@ -510,21 +222,29 @@ async def sync_crawls(self, data: MCSyncData):
)
await self.increment_pod_exec_time(
- pods, status, crawl.id, crawl.oid, EXEC_TIME_UPDATE_SECS
+ pods, crawl, status, EXEC_TIME_UPDATE_SECS
)
else:
status.scale = crawl.scale
- status.lastUpdatedTime = to_k8s_date(dt_now())
+ now = dt_now()
+ await self.crawl_ops.inc_crawl_exec_time(
+ crawl.db_crawl_id, crawl.is_qa, 0, now
+ )
+ status.lastUpdatedTime = to_k8s_date(now)
children = self._load_redis(params, status, data.children)
storage_path = crawl.storage.get_storage_extra_path(oid)
storage_secret = crawl.storage.get_storage_secret_name(oid)
+ if not crawl.is_qa:
+ params["profile_filename"] = configmap["PROFILE_FILENAME"]
+ else:
+ storage_path += "qa/"
+
params["storage_path"] = storage_path
params["storage_secret"] = storage_secret
- params["profile_filename"] = configmap["PROFILE_FILENAME"]
# only resolve if not already set
# not automatically updating image for existing crawls
@@ -538,6 +258,8 @@ async def sync_crawls(self, data: MCSyncData):
params["storage_filename"] = configmap["STORE_FILENAME"]
params["restart_time"] = spec.get("restartTime")
+ params["warc_prefix"] = spec.get("warcPrefix")
+
params["redis_url"] = redis_url
if spec.get("restartTime") != status.restartTime:
@@ -548,6 +270,10 @@ async def sync_crawls(self, data: MCSyncData):
else:
params["force_restart"] = False
+ if crawl.qa_source_crawl_id:
+ params["qa_source_crawl_id"] = crawl.qa_source_crawl_id
+ children.extend(await self._load_qa_configmap(params, data.children))
+
for i in range(0, status.scale):
children.extend(self._load_crawler(params, i, status, data.children))
@@ -573,6 +299,25 @@ def _load_redis(self, params, status, children):
return self.load_from_yaml("redis.yaml", params)
+ async def _load_qa_configmap(self, params, children):
+ qa_source_crawl_id = params["qa_source_crawl_id"]
+ name = f"qa-replay-{qa_source_crawl_id}"
+
+ if name in children[CMAP]:
+ return [children[CMAP][name]]
+
+ pages, _ = await self.page_ops.list_pages(qa_source_crawl_id, page_size=1000)
+
+ crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id)
+
+ res_and_pages = PagesAndResources(resources=crawl_replay.resources, pages=pages)
+
+ params["name"] = name
+ params["qa_source_replay_json"] = res_and_pages.json()
+ # params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
+
+ return self.load_from_yaml("qa_configmap.yaml", params)
+
def _load_crawler(self, params, i, status, children):
name = f"crawl-{params['id']}-{i}"
has_pod = name in children[POD]
@@ -587,7 +332,10 @@ def _load_crawler(self, params, i, status, children):
if params.get("do_restart"):
print(f"Restart {name}")
- params["priorityClassName"] = f"crawl-instance-{i}"
+ if params.get("qa_source_crawl_id"):
+ params["priorityClassName"] = f"qa-crawl-instance-{i}"
+ else:
+ params["priorityClassName"] = f"crawl-instance-{i}"
return self.load_from_yaml("crawler.yaml", params)
@@ -661,7 +409,15 @@ def sync_resources(self, status, name, pod, children):
src = pvc["spec"]["resources"]["requests"]
resources.storage = int(parse_quantity(src.get("storage")))
- async def set_state(self, state, status, crawl_id, allowed_from, **kwargs):
+ async def set_state(
+ self,
+ state: str,
+ status: CrawlStatus,
+ crawl: CrawlSpec,
+ allowed_from: list[str],
+ finished: Optional[datetime] = None,
+ stats: Optional[CrawlStats] = None,
+ ):
"""set status state and update db, if changed
if allowed_from passed in, can only transition from allowed_from state,
otherwise get current state from db and return
@@ -688,15 +444,22 @@ async def set_state(self, state, status, crawl_id, allowed_from, **kwargs):
"""
if not allowed_from or status.state in allowed_from:
res = await self.crawl_ops.update_crawl_state_if_allowed(
- crawl_id, state=state, allowed_from=allowed_from, **kwargs
+ crawl.db_crawl_id,
+ crawl.is_qa,
+ state=state,
+ allowed_from=allowed_from,
+ finished=finished,
+ stats=stats,
)
if res:
- print(f"Setting state: {status.state} -> {state}, {crawl_id}")
+ print(f"Setting state: {status.state} -> {state}, {crawl.id}")
status.state = state
return True
# get actual crawl state
- actual_state, finished = await self.crawl_ops.get_crawl_state(crawl_id)
+ actual_state, finished = await self.crawl_ops.get_crawl_state(
+ crawl.db_crawl_id, crawl.is_qa
+ )
if actual_state:
status.state = actual_state
if finished:
@@ -709,16 +472,10 @@ async def set_state(self, state, status, crawl_id, allowed_from, **kwargs):
if status.state != state:
print(
- f"Not setting state: {status.state} -> {state}, {crawl_id} not allowed"
+ f"Not setting state: {status.state} -> {state}, {crawl.id} not allowed"
)
return False
- def load_from_yaml(self, filename, params):
- """load and parse k8s template from yaml file"""
- return list(
- yaml.safe_load_all(self.templates.env.get_template(filename).render(params))
- )
-
def get_related(self, data: MCBaseRequest):
"""return objects related to crawl pods"""
spec = data.parent.get("spec", {})
@@ -738,7 +495,7 @@ def get_related(self, data: MCBaseRequest):
},
]
- if self._has_pod_metrics:
+ if self.k8s.has_pod_metrics:
related_resources.append(
{
"apiVersion": METRICS_API,
@@ -782,23 +539,21 @@ async def can_start_new(self, crawl: CrawlSpec, data: MCSyncData, status):
i += 1
await self.set_state(
- "waiting_org_limit", status, crawl.id, allowed_from=["starting"]
+ "waiting_org_limit", status, crawl, allowed_from=["starting"]
)
return False
async def cancel_crawl(
self,
- crawl_id: str,
- cid: UUID,
- oid: UUID,
+ crawl: CrawlSpec,
status: CrawlStatus,
pods: dict,
) -> bool:
"""Mark crawl as canceled"""
- if not await self.mark_finished(crawl_id, cid, oid, status, "canceled"):
+ if not await self.mark_finished(crawl, status, "canceled"):
return False
- await self.mark_for_cancelation(crawl_id)
+ await self.mark_for_cancelation(crawl.id)
if not status.canceled:
for name, pod in pods.items():
@@ -823,19 +578,15 @@ async def cancel_crawl(
async def fail_crawl(
self,
- crawl_id: str,
- cid: UUID,
- oid: UUID,
+ crawl: CrawlSpec,
status: CrawlStatus,
pods: dict,
- stats=None,
+ stats: Optional[CrawlStats] = None,
) -> bool:
"""Mark crawl as failed, log crawl state and print crawl logs, if possible"""
prev_state = status.state
- if not await self.mark_finished(
- crawl_id, cid, oid, status, "failed", stats=stats
- ):
+ if not await self.mark_finished(crawl, status, "failed", stats=stats):
return False
if not self.log_failed_crawl_lines or prev_state == "failed":
@@ -847,7 +598,7 @@ async def fail_crawl(
print(f"============== POD STATUS: {name} ==============")
pprint(pods[name]["status"])
- self.run_task(self.print_pod_logs(pod_names, self.log_failed_crawl_lines))
+ self.run_task(self.k8s.print_pod_logs(pod_names, self.log_failed_crawl_lines))
return True
@@ -860,8 +611,7 @@ def _empty_response(self, status):
async def finalize_response(
self,
- crawl_id: str,
- oid: UUID,
+ crawl: CrawlSpec,
status: CrawlStatus,
spec: dict,
children: dict,
@@ -869,7 +619,7 @@ async def finalize_response(
):
"""ensure crawl id ready for deletion"""
- redis_pod = f"redis-{crawl_id}"
+ redis_pod = f"redis-{crawl.id}"
new_children = []
finalized = False
@@ -880,7 +630,7 @@ async def finalize_response(
# if has other pods, keep redis pod until they are removed
if len(pods) > 1:
new_children = self._load_redis(params, status, children)
- await self.increment_pod_exec_time(pods, status, crawl_id, oid)
+ await self.increment_pod_exec_time(pods, crawl, status)
# keep pvs until pods are removed
if new_children:
@@ -891,12 +641,15 @@ async def finalize_response(
if status.finished:
ttl = spec.get("ttlSecondsAfterFinished", DEFAULT_TTL)
finished = from_k8s_date(status.finished)
- if (dt_now() - finished).total_seconds() > ttl > 0:
- print("CrawlJob expired, deleting: " + crawl_id)
+ if (dt_now() - finished).total_seconds() > ttl >= 0:
+ print("CrawlJob expired, deleting: " + crawl.id)
finalized = True
else:
finalized = True
+ if finalized and crawl.is_qa:
+ await self.crawl_ops.qa_run_finished(crawl.db_crawl_id)
+
return {
"status": status.dict(exclude_none=True),
"children": new_children,
@@ -907,7 +660,7 @@ async def _get_redis(self, redis_url: str) -> Optional[Redis]:
"""init redis, ensure connectivity"""
redis = None
try:
- redis = await self.get_redis_client(redis_url)
+ redis = await self.k8s.get_redis_client(redis_url)
# test connection
await redis.ping()
return redis
@@ -942,34 +695,36 @@ async def sync_crawl_state(
if status.anyCrawlPodNewExit:
await self.log_crashes(crawl.id, status.podStatus, redis)
- if not crawler_running:
+ if not crawler_running or not redis:
+ # if either crawler is not running or redis is inaccessible
if self.should_mark_waiting(status.state, crawl.started):
+ # mark as waiting (if already running)
await self.set_state(
"waiting_capacity",
status,
- crawl.id,
+ crawl,
allowed_from=RUNNING_AND_STARTING_ONLY,
)
- # for now, don't reset redis once inited
- if status.lastActiveTime and (
- (dt_now() - from_k8s_date(status.lastActiveTime)).total_seconds()
- > REDIS_TTL
- ):
- print(
- f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs"
- )
- status.initRedis = False
-
- # if still running, resync after N seconds
- status.resync_after = self.fast_retry_secs
- return status
-
- status.initRedis = True
- status.lastActiveTime = to_k8s_date(dt_now())
+ if not crawler_running and redis:
+                # crawler not running but redis still up: pause redis once no
+                # crawler pods have been active for more than REDIS_TTL secs
+ if status.lastActiveTime and (
+ (
+ dt_now() - from_k8s_date(status.lastActiveTime)
+ ).total_seconds()
+ > REDIS_TTL
+ ):
+ print(
+ f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs"
+ )
+ status.initRedis = False
+ elif crawler_running and not redis:
+ # if crawler is running, but no redis, init redis
+ status.initRedis = True
+ status.lastActiveTime = to_k8s_date(dt_now())
- if not redis:
- # if still running, resync after N seconds
+ # if no crawler / no redis, resync after N seconds
status.resync_after = self.fast_retry_secs
return status
@@ -979,7 +734,7 @@ async def sync_crawl_state(
if await self.set_state(
"running",
status,
- crawl.id,
+ crawl,
allowed_from=["starting", "waiting_capacity"],
):
self.run_task(
@@ -989,7 +744,6 @@ async def sync_crawl_state(
)
file_done = await redis.lpop(self.done_key)
-
while file_done:
msg = json.loads(file_done)
# add completed file
@@ -1000,6 +754,25 @@ async def sync_crawl_state(
# get next file done
file_done = await redis.lpop(self.done_key)
+ page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}")
+ qa_run_id = crawl.id if crawl.is_qa else None
+
+ while page_crawled:
+ print("PAGE DATA", flush=True)
+ print(page_crawled, flush=True)
+ page_dict = json.loads(page_crawled)
+ await self.page_ops.add_page_to_db(
+ page_dict, crawl.db_crawl_id, qa_run_id, crawl.oid
+ )
+ page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}")
+
+ crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}")
+ while crawl_error:
+ await self.crawl_ops.add_crawl_error(
+ crawl.db_crawl_id, crawl.is_qa, crawl_error
+ )
+ crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}")
+
# ensure filesAdded and filesAddedSize always set
status.filesAdded = int(await redis.get("filesAdded") or 0)
status.filesAddedSize = int(await redis.get("filesAddedSize") or 0)
@@ -1095,20 +868,24 @@ def handle_terminated_pod(self, name, role, status, terminated):
async def increment_pod_exec_time(
self,
pods: dict[str, dict],
+ crawl: CrawlSpec,
status: CrawlStatus,
- crawl_id: str,
- oid: UUID,
min_duration=0,
) -> None:
"""inc exec time tracking"""
now = dt_now()
- if not status.lastUpdatedTime:
+ update_start_time = await self.crawl_ops.get_crawl_exec_last_update_time(
+ crawl.db_crawl_id
+ )
+
+ if not update_start_time:
+ await self.crawl_ops.inc_crawl_exec_time(
+ crawl.db_crawl_id, crawl.is_qa, 0, now
+ )
status.lastUpdatedTime = to_k8s_date(now)
return
- update_start_time = from_k8s_date(status.lastUpdatedTime)
-
reason = None
update_duration = (now - update_start_time).total_seconds()
@@ -1182,8 +959,9 @@ async def increment_pod_exec_time(
max_duration = max(duration, max_duration)
if exec_time:
- await self.crawl_ops.inc_crawl_exec_time(crawl_id, exec_time)
- await self.org_ops.inc_org_time_stats(oid, exec_time, True)
+ await self.org_ops.inc_org_time_stats(
+ crawl.oid, exec_time, True, crawl.is_qa
+ )
status.crawlExecTime += exec_time
status.elapsedCrawlTime += max_duration
@@ -1192,6 +970,9 @@ async def increment_pod_exec_time(
flush=True,
)
+ await self.crawl_ops.inc_crawl_exec_time(
+ crawl.db_crawl_id, crawl.is_qa, exec_time, now
+ )
status.lastUpdatedTime = to_k8s_date(now)
def should_mark_waiting(self, state, started):
@@ -1216,7 +997,7 @@ async def add_used_stats(self, crawl_id, pod_status, redis, metrics):
pod_info.used.storage = storage
# if no pod metrics, get memory estimate from redis itself
- if not self._has_pod_metrics:
+ if not self.k8s.has_pod_metrics:
stats = await redis.info("memory")
pod_info.used.memory = int(stats.get("used_memory_rss", 0))
@@ -1292,7 +1073,13 @@ async def add_file_to_crawl(self, cc_data, crawl, redis):
await redis.incr("filesAddedSize", filecomplete.size)
- await self.crawl_ops.add_crawl_file(crawl.id, crawl_file, filecomplete.size)
+ await self.crawl_ops.add_crawl_file(
+ crawl.db_crawl_id, crawl.is_qa, crawl_file, filecomplete.size
+ )
+
+ # no replicas for QA for now
+ if crawl.is_qa:
+ return True
try:
await self.background_job_ops.create_replica_jobs(
@@ -1336,7 +1123,9 @@ async def is_crawl_stopping(
return None
- async def get_redis_crawl_stats(self, redis: Redis, crawl_id: str):
+ async def get_redis_crawl_stats(
+ self, redis: Redis, crawl_id: str
+ ) -> tuple[CrawlStats, dict[str, Any]]:
"""get page stats"""
try:
# crawler >0.9.0, done key is a value
@@ -1349,7 +1138,7 @@ async def get_redis_crawl_stats(self, redis: Redis, crawl_id: str):
sizes = await redis.hgetall(f"{crawl_id}:size")
archive_size = sum(int(x) for x in sizes.values())
- stats = {"found": pages_found, "done": pages_done, "size": archive_size}
+ stats = CrawlStats(found=pages_found, done=pages_done, size=archive_size)
return stats, sizes
async def update_crawl_state(
@@ -1365,15 +1154,17 @@ async def update_crawl_state(
stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id)
# need to add size of previously completed WACZ files as well!
- stats["size"] += status.filesAddedSize
+ stats.size += status.filesAddedSize
# update status
- status.pagesDone = stats["done"]
- status.pagesFound = stats["found"]
- status.size = stats["size"]
+ status.pagesDone = stats.done
+ status.pagesFound = stats.found
+ status.size = stats.size
status.sizeHuman = humanize.naturalsize(status.size)
- await self.crawl_ops.update_running_crawl_stats(crawl.id, stats)
+ await self.crawl_ops.update_running_crawl_stats(
+ crawl.db_crawl_id, crawl.is_qa, stats
+ )
for key, value in sizes.items():
value = int(value)
@@ -1409,9 +1200,7 @@ async def update_crawl_state(
# check if one-page crawls actually succeeded
# if only one page found, and no files, assume failed
if status.pagesFound == 1 and not status.filesAdded:
- await self.fail_crawl(
- crawl.id, crawl.cid, crawl.oid, status, pods, stats
- )
+ await self.fail_crawl(crawl, status, pods, stats)
return status
if status.stopReason in ("stopped_by_user", "stopped_quota_reached"):
@@ -1419,21 +1208,15 @@ async def update_crawl_state(
else:
state = "complete"
- await self.mark_finished(
- crawl.id, crawl.cid, crawl.oid, status, state, crawl, stats
- )
+ await self.mark_finished(crawl, status, state, stats)
# check if all crawlers failed
elif status_count.get("failed", 0) >= crawl.scale:
# if stopping, and no pages finished, mark as canceled
if status.stopping and not status.pagesDone:
- await self.mark_finished(
- crawl.id, crawl.cid, crawl.oid, status, "canceled", crawl, stats
- )
+ await self.mark_finished(crawl, status, "canceled", stats)
else:
- await self.fail_crawl(
- crawl.id, crawl.cid, crawl.oid, status, pods, stats
- )
+ await self.fail_crawl(crawl, status, pods, stats)
# check for other statuses
else:
@@ -1450,7 +1233,7 @@ async def update_crawl_state(
new_status = "pending-wait"
if new_status:
await self.set_state(
- new_status, status, crawl.id, allowed_from=RUNNING_STATES
+ new_status, status, crawl, allowed_from=RUNNING_STATES
)
return status
@@ -1458,22 +1241,15 @@ async def update_crawl_state(
# pylint: disable=too-many-arguments
async def mark_finished(
self,
- crawl_id: str,
- cid: UUID,
- oid: UUID,
+ crawl: CrawlSpec,
status: CrawlStatus,
state: str,
- crawl=None,
- stats=None,
+ stats: Optional[CrawlStats] = None,
) -> bool:
"""mark crawl as finished, set finished timestamp and final state"""
finished = dt_now()
- kwargs = {"finished": finished}
- if stats:
- kwargs["stats"] = stats
-
if state in SUCCESSFUL_STATES:
allowed_from = RUNNING_STATES
else:
@@ -1481,7 +1257,12 @@ async def mark_finished(
# if set_state returns false, already set to same status, return
if not await self.set_state(
- state, status, crawl_id, allowed_from=allowed_from, **kwargs
+ state,
+ status,
+ crawl,
+ allowed_from=allowed_from,
+ finished=finished,
+ stats=stats,
):
print("already finished, ignoring mark_finished")
if not status.finished:
@@ -1491,44 +1272,63 @@ async def mark_finished(
status.finished = to_k8s_date(finished)
- if crawl and state in SUCCESSFUL_STATES:
+ if state in SUCCESSFUL_STATES:
await self.inc_crawl_complete_stats(crawl, finished)
- self.run_task(
- self.do_crawl_finished_tasks(
- crawl_id, cid, oid, status.filesAddedSize, state
- )
- )
+ # Regular Crawl Finished
+ if not crawl.is_qa:
+ self.run_task(self.do_crawl_finished_tasks(crawl, status, state))
+
+ # QA Run Finished
+ else:
+ self.run_task(self.do_qa_run_finished_tasks(crawl, state))
return True
# pylint: disable=too-many-arguments
async def do_crawl_finished_tasks(
self,
- crawl_id: str,
- cid: UUID,
- oid: UUID,
- files_added_size: int,
+ crawl: CrawlSpec,
+ status: CrawlStatus,
state: str,
) -> None:
"""Run tasks after crawl completes in asyncio.task coroutine."""
- await self.crawl_config_ops.stats_recompute_last(cid, files_added_size, 1)
+ await self.crawl_config_ops.stats_recompute_last(
+ crawl.cid, status.filesAddedSize, 1
+ )
+
+ if state in SUCCESSFUL_STATES and crawl.oid:
+ await self.org_ops.inc_org_bytes_stored(
+ crawl.oid, status.filesAddedSize, "crawl"
+ )
+ await self.coll_ops.add_successful_crawl_to_collections(crawl.id, crawl.cid)
- if state in SUCCESSFUL_STATES and oid:
- await self.org_ops.inc_org_bytes_stored(oid, files_added_size, "crawl")
- await self.coll_ops.add_successful_crawl_to_collections(crawl_id, cid)
+ if state in FAILED_STATES:
+ await self.crawl_ops.delete_crawl_files(crawl.id, crawl.oid)
+ await self.page_ops.delete_crawl_pages(crawl.id, crawl.oid)
await self.event_webhook_ops.create_crawl_finished_notification(
- crawl_id, oid, state
+ crawl.id, crawl.oid, state
)
- # add crawl errors to db
- await self.add_crawl_errors_to_db(crawl_id)
+ # finally, delete job
+ await self.k8s.delete_crawl_job(crawl.id)
+
+ # pylint: disable=too-many-arguments
+ async def do_qa_run_finished_tasks(
+ self,
+ crawl: CrawlSpec,
+ state: str,
+ ) -> None:
+ """Run tasks after qa run completes in asyncio.task coroutine."""
+
+ if state in FAILED_STATES:
+ await self.page_ops.delete_qa_run_from_pages(crawl.db_crawl_id, crawl.id)
# finally, delete job
- await self.delete_crawl_job(crawl_id)
+ await self.k8s.delete_crawl_job(crawl.id)
- async def inc_crawl_complete_stats(self, crawl, finished):
+ async def inc_crawl_complete_stats(self, crawl: CrawlSpec, finished: datetime):
"""Increment Crawl Stats"""
started = from_k8s_date(crawl.started)
@@ -1537,12 +1337,12 @@ async def inc_crawl_complete_stats(self, crawl, finished):
print(f"Duration: {duration}", flush=True)
- await self.org_ops.inc_org_time_stats(crawl.oid, duration)
+ await self.org_ops.inc_org_time_stats(crawl.oid, duration, False, crawl.is_qa)
async def mark_for_cancelation(self, crawl_id):
"""mark crawl as canceled in redis"""
try:
- redis_url = self.get_redis_url(crawl_id)
+ redis_url = self.k8s.get_redis_url(crawl_id)
redis = await self._get_redis(redis_url)
if not redis:
return False
@@ -1552,220 +1352,3 @@ async def mark_for_cancelation(self, crawl_id):
finally:
if redis:
await redis.close()
-
- async def add_crawl_errors_to_db(self, crawl_id, inc=100):
- """Pull crawl errors from redis and write to mongo db"""
- index = 0
- redis = None
- try:
- redis_url = self.get_redis_url(crawl_id)
- redis = await self._get_redis(redis_url)
- if not redis:
- return
-
- # ensure this only runs once
- if not await redis.setnx("errors-exported", "1"):
- return
-
- while True:
- skip = index * inc
- upper_bound = skip + inc - 1
- errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound)
- if not errors:
- break
-
- await self.crawl_ops.add_crawl_errors(crawl_id, errors)
-
- if len(errors) < inc:
- # If we have fewer than inc errors, we can assume this is the
- # last page of data to add.
- break
- index += 1
- # pylint: disable=bare-except
- except:
- # likely redis has already been deleted, so nothing to do
- pass
- finally:
- if redis:
- await redis.close()
-
- def get_cronjob_crawl_related(self, data: MCBaseRequest):
- """return configmap related to crawl"""
- labels = data.parent.get("metadata", {}).get("labels", {})
- cid = labels.get("btrix.crawlconfig")
- return {
- "relatedResources": [
- {
- "apiVersion": "v1",
- "resource": "configmaps",
- "labelSelector": {"matchLabels": {"btrix.crawlconfig": cid}},
- }
- ]
- }
-
- async def sync_cronjob_crawl(self, data: MCDecoratorSyncData):
- """create crawljobs from a job object spawned by cronjob"""
-
- metadata = data.object["metadata"]
- labels = metadata.get("labels", {})
- cid = labels.get("btrix.crawlconfig")
-
- name = metadata.get("name")
- crawl_id = name
-
- actual_state, finished = await self.crawl_ops.get_crawl_state(crawl_id)
- if finished:
- status = None
- # mark job as completed
- if not data.object["status"].get("succeeded"):
- print("Cron Job Complete!", finished)
- status = {
- "succeeded": 1,
- "startTime": metadata.get("creationTimestamp"),
- "completionTime": to_k8s_date(finished),
- }
-
- return {
- "attachments": [],
- "annotations": {"finished": finished},
- "status": status,
- }
-
- configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
-
- oid = configmap.get("ORG_ID")
- userid = configmap.get("USER_ID")
-
- crawljobs = data.attachments[CJS]
-
- org = await self.org_ops.get_org_by_id(UUID(oid))
-
- crawl_id, crawljob = self.new_crawl_job_yaml(
- cid,
- userid=userid,
- oid=oid,
- storage=org.storage,
- crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
- scale=int(configmap.get("INITIAL_SCALE", 1)),
- crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
- max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
- manual=False,
- crawl_id=crawl_id,
- )
-
- attachments = list(yaml.safe_load_all(crawljob))
-
- if crawl_id in crawljobs:
- attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
-
- if not actual_state:
- # pylint: disable=duplicate-code
- crawlconfig = await self.crawl_config_ops.get_crawl_config(
- UUID(cid), UUID(oid)
- )
- if not crawlconfig:
- print(
- f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?"
- )
- return {"attachments": []}
-
- # db create
- user = await self.user_ops.get_by_id(UUID(userid))
- if not user:
- print(f"error: missing user for id {userid}")
- return {"attachments": []}
-
- await self.crawl_config_ops.add_new_crawl(
- crawl_id, crawlconfig, user, manual=False
- )
- print("Scheduled Crawl Created: " + crawl_id)
-
- return {
- "attachments": attachments,
- }
-
- async def finalize_background_job(self, data: MCDecoratorSyncData) -> dict:
- """handle finished background job"""
-
- metadata = data.object["metadata"]
- labels: dict[str, str] = metadata.get("labels", {})
- oid: str = labels.get("btrix.org") or ""
- job_type: str = labels.get("job_type") or ""
- job_id: str = metadata.get("name")
-
- status = data.object["status"]
- success = status.get("succeeded") == 1
- completion_time = status.get("completionTime")
-
- finalized = True
-
- finished = from_k8s_date(completion_time) if completion_time else dt_now()
-
- try:
- await self.background_job_ops.job_finished(
- job_id, job_type, UUID(oid), success=success, finished=finished
- )
- # print(
- # f"{job_type} background job completed: success: {success}, {job_id}",
- # flush=True,
- # )
-
- # pylint: disable=broad-except
- except Exception:
- print("Update Background Job Error", flush=True)
- traceback.print_exc()
-
- return {"attachments": [], "finalized": finalized}
-
- def run_task(self, func):
- """add bg tasks to set to avoid premature garbage collection"""
- task = asyncio.create_task(func)
- self.bg_tasks.add(task)
- task.add_done_callback(self.bg_tasks.discard)
-
-
-# ============================================================================
-def init_operator_api(app, *args):
- """regsiters webhook handlers for metacontroller"""
-
- oper = BtrixOperator(*args)
-
- @app.post("/op/crawls/sync")
- async def mc_sync_crawls(data: MCSyncData):
- return await oper.sync_crawls(data)
-
- # reuse sync path, but distinct endpoint for better logging
- @app.post("/op/crawls/finalize")
- async def mc_sync_finalize(data: MCSyncData):
- return await oper.sync_crawls(data)
-
- @app.post("/op/crawls/customize")
- async def mc_related(data: MCBaseRequest):
- return oper.get_related(data)
-
- @app.post("/op/profilebrowsers/sync")
- async def mc_sync_profile_browsers(data: MCSyncData):
- return await oper.sync_profile_browsers(data)
-
- @app.post("/op/cronjob/sync")
- async def mc_sync_cronjob_crawls(data: MCDecoratorSyncData):
- return await oper.sync_cronjob_crawl(data)
-
- @app.post("/op/cronjob/customize")
- async def mc_cronjob_related(data: MCBaseRequest):
- return oper.get_cronjob_crawl_related(data)
-
- # nop, but needed for metacontroller
- @app.post("/op/backgroundjob/sync")
- async def mc_sync_background_jobs():
- return {"attachments": []}
-
- @app.post("/op/backgroundjob/finalize")
- async def mc_finalize_background_jobs(data: MCDecoratorSyncData):
- return await oper.finalize_background_job(data)
-
- @app.get("/healthz", include_in_schema=False)
- async def healthz():
- return {}
-
- return oper
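
Throughout the refactored operator, QA runs and regular crawls share one code path: CrawlSpec.is_qa is true whenever qa_source_crawl_id is set, and db_crawl_id redirects database writes (state, files, exec time, pages) to the source crawl. A small sketch of that mapping using the CrawlSpec model defined in operator/models.py below (ids and timestamps are illustrative):

from uuid import uuid4

from btrixcloud.models import StorageRef
from btrixcloud.operator.models import CrawlSpec

qa_run = CrawlSpec(
    id="qa-20240102-def",                      # id of the QA CrawlJob itself
    cid=uuid4(),
    oid=uuid4(),
    storage=StorageRef("default"),
    started="2024-01-02T00:00:00Z",
    crawler_channel="default",
    qa_source_crawl_id="manual-20240101-abc",  # crawl being QA'd
)

assert qa_run.is_qa                                  # QA run detected
assert qa_run.db_crawl_id == "manual-20240101-abc"   # db updates target the source crawl
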
diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py
new file mode 100644
index 0000000000..445e86fbca
--- /dev/null
+++ b/backend/btrixcloud/operator/cronjobs.py
@@ -0,0 +1,130 @@
+""" Operator handler for crawl CronJobs """
+
+from uuid import UUID
+import yaml
+
+from btrixcloud.utils import to_k8s_date
+from .models import MCBaseRequest, MCDecoratorSyncData, CJS, CMAP
+from .baseoperator import BaseOperator
+
+
+# pylint: disable=too-many-locals
+# ============================================================================
+class CronJobOperator(BaseOperator):
+ """CronJob Operator"""
+
+ def init_routes(self, app):
+ """init routes for crawl CronJob decorator"""
+
+ @app.post("/op/cronjob/sync")
+ async def mc_sync_cronjob_crawls(data: MCDecoratorSyncData):
+ return await self.sync_cronjob_crawl(data)
+
+ @app.post("/op/cronjob/customize")
+ async def mc_cronjob_related(data: MCBaseRequest):
+ return self.get_cronjob_crawl_related(data)
+
+ def get_cronjob_crawl_related(self, data: MCBaseRequest):
+ """return configmap related to crawl"""
+ labels = data.parent.get("metadata", {}).get("labels", {})
+ cid = labels.get("btrix.crawlconfig")
+ return {
+ "relatedResources": [
+ {
+ "apiVersion": "v1",
+ "resource": "configmaps",
+ "labelSelector": {"matchLabels": {"btrix.crawlconfig": cid}},
+ }
+ ]
+ }
+
+ async def sync_cronjob_crawl(self, data: MCDecoratorSyncData):
+ """create crawljobs from a job object spawned by cronjob"""
+
+ metadata = data.object["metadata"]
+ labels = metadata.get("labels", {})
+ cid = labels.get("btrix.crawlconfig")
+
+ name = metadata.get("name")
+ crawl_id = name
+
+ actual_state, finished = await self.crawl_ops.get_crawl_state(
+ crawl_id, is_qa=False
+ )
+ if finished:
+ status = None
+ # mark job as completed
+ if not data.object["status"].get("succeeded"):
+ print("Cron Job Complete!", finished)
+ status = {
+ "succeeded": 1,
+ "startTime": metadata.get("creationTimestamp"),
+ "completionTime": to_k8s_date(finished),
+ }
+
+ return {
+ "attachments": [],
+ "annotations": {"finished": finished},
+ "status": status,
+ }
+
+ configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
+
+ oid = configmap.get("ORG_ID")
+ userid = configmap.get("USER_ID")
+
+ crawljobs = data.attachments[CJS]
+
+ org = await self.org_ops.get_org_by_id(UUID(oid))
+
+ warc_prefix = None
+
+ if not actual_state:
+            # crawl not in db yet, create it from the crawlconfig
+ crawlconfig = await self.crawl_config_ops.get_crawl_config(
+ UUID(cid), UUID(oid)
+ )
+ if not crawlconfig:
+ print(
+ f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?"
+ )
+ return {"attachments": []}
+
+ # db create
+ user = await self.user_ops.get_by_id(UUID(userid))
+ if not user:
+ print(f"error: missing user for id {userid}")
+ return {"attachments": []}
+
+ warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)
+
+ await self.crawl_config_ops.add_new_crawl(
+ crawl_id,
+ crawlconfig,
+ user,
+ manual=False,
+ )
+ print("Scheduled Crawl Created: " + crawl_id)
+
+ crawl_id, crawljob = self.k8s.new_crawl_job_yaml(
+ cid,
+ userid=userid,
+ oid=oid,
+ storage=org.storage,
+ crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
+ scale=int(configmap.get("INITIAL_SCALE", 1)),
+ crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
+ max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
+ manual=False,
+ crawl_id=crawl_id,
+ warc_prefix=warc_prefix,
+ )
+
+ attachments = list(yaml.safe_load_all(crawljob))
+
+ if crawl_id in crawljobs:
+            attachments[0]["status"] = crawljobs[crawl_id]["status"]
+
+ return {
+ "attachments": attachments,
+ }
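For orientation, here is a minimal sketch of the metacontroller `customize` exchange that `get_cronjob_crawl_related` implements. The label key and response shape come from the handler above; the Job name and UUID are placeholder values, not taken from a real cluster.

```python
# Sketch only: shape of the customize hook exchange (placeholder values)
parent_job = {
    "metadata": {
        "name": "sched-20240301-abc123",  # hypothetical scheduled Job name
        "labels": {"btrix.crawlconfig": "11111111-2222-3333-4444-555555555555"},
    }
}

cid = parent_job["metadata"]["labels"]["btrix.crawlconfig"]

# The handler asks metacontroller for the crawl ConfigMap matching that label,
# which then shows up under data.related[CMAP] in the sync hook.
customize_response = {
    "relatedResources": [
        {
            "apiVersion": "v1",
            "resource": "configmaps",
            "labelSelector": {"matchLabels": {"btrix.crawlconfig": cid}},
        }
    ]
}

print(customize_response)
```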
diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py
new file mode 100644
index 0000000000..f5a2f41473
--- /dev/null
+++ b/backend/btrixcloud/operator/models.py
@@ -0,0 +1,209 @@
+""" Operator Models """
+
+from collections import defaultdict
+from uuid import UUID
+from typing import Optional, DefaultDict
+from pydantic import BaseModel, Field
+from kubernetes.utils import parse_quantity
+from btrixcloud.models import StorageRef
+
+
+BTRIX_API = "btrix.cloud/v1"
+
+CMAP = "ConfigMap.v1"
+PVC = "PersistentVolumeClaim.v1"
+POD = "Pod.v1"
+CJS = f"CrawlJob.{BTRIX_API}"
+
+
+# ============================================================================
+class MCBaseRequest(BaseModel):
+ """base metacontroller model, used for customize hook"""
+
+ parent: dict
+ controller: dict
+
+
+# ============================================================================
+class MCSyncData(MCBaseRequest):
+ """sync / finalize metacontroller model"""
+
+ children: dict
+ related: dict
+ finalizing: bool = False
+
+
+# ============================================================================
+class MCDecoratorSyncData(BaseModel):
+ """sync for decoratorcontroller model"""
+
+ object: dict
+ controller: dict
+
+ attachments: dict
+ related: dict
+ finalizing: bool = False
+
+
+# ============================================================================
+class CrawlSpec(BaseModel):
+ """spec from k8s CrawlJob object"""
+
+ id: str
+ cid: UUID
+ oid: UUID
+ scale: int = 1
+ storage: StorageRef
+ started: str
+ crawler_channel: str
+ stopping: bool = False
+ scheduled: bool = False
+ timeout: int = 0
+ max_crawl_size: int = 0
+ qa_source_crawl_id: Optional[str] = ""
+
+ @property
+ def db_crawl_id(self) -> str:
+ """return actual crawl_id for db, if qa run"""
+ return self.qa_source_crawl_id or self.id
+
+ @property
+ def is_qa(self) -> bool:
+ """return true if qa run"""
+ return bool(self.qa_source_crawl_id)
+
+
+# ============================================================================
+class PodResourcePercentage(BaseModel):
+ """Resource usage percentage ratios"""
+
+ memory: float = 0
+ cpu: float = 0
+ storage: float = 0
+
+
+# ============================================================================
+class PodResources(BaseModel):
+ """Pod Resources"""
+
+ memory: int = 0
+ cpu: float = 0
+ storage: int = 0
+
+ def __init__(self, *a, **kw):
+ if "memory" in kw:
+ kw["memory"] = int(parse_quantity(kw["memory"]))
+ if "cpu" in kw:
+ kw["cpu"] = float(parse_quantity(kw["cpu"]))
+ if "storage" in kw:
+ kw["storage"] = int(parse_quantity(kw["storage"]))
+ super().__init__(*a, **kw)
+
+
+# ============================================================================
+class PodInfo(BaseModel):
+ """Aggregate pod status info held in CrawlJob"""
+
+ exitTime: Optional[str] = None
+ exitCode: Optional[int] = None
+ isNewExit: Optional[bool] = Field(default=None, exclude=True)
+ reason: Optional[str] = None
+
+ allocated: PodResources = PodResources()
+ used: PodResources = PodResources()
+
+ newCpu: Optional[int] = None
+ newMemory: Optional[int] = None
+
+ def dict(self, *a, **kw):
+ res = super().dict(*a, **kw)
+ percent = {
+ "memory": self.get_percent_memory(),
+ "cpu": self.get_percent_cpu(),
+ "storage": self.get_percent_storage(),
+ }
+ res["percent"] = percent
+ return res
+
+ def get_percent_memory(self) -> float:
+ """compute percent memory used"""
+ return (
+ float(self.used.memory) / float(self.allocated.memory)
+ if self.allocated.memory
+ else 0
+ )
+
+ def get_percent_cpu(self) -> float:
+ """compute percent cpu used"""
+ return (
+ float(self.used.cpu) / float(self.allocated.cpu)
+ if self.allocated.cpu
+ else 0
+ )
+
+ def get_percent_storage(self) -> float:
+ """compute percent storage used"""
+ return (
+ float(self.used.storage) / float(self.allocated.storage)
+ if self.allocated.storage
+ else 0
+ )
+
+ def should_restart_pod(self):
+ """return true if pod should be restarted"""
+ if self.newMemory and self.newMemory != self.allocated.memory:
+ return True
+
+ if self.newCpu and self.newCpu != self.allocated.cpu:
+ return True
+
+ return False
+
+
+# ============================================================================
+# pylint: disable=invalid-name
+class CrawlStatus(BaseModel):
+ """status from k8s CrawlJob object"""
+
+ state: str = "starting"
+ pagesFound: int = 0
+ pagesDone: int = 0
+ size: int = 0
+ # human readable size string
+ sizeHuman: str = ""
+ scale: int = 1
+ filesAdded: int = 0
+ filesAddedSize: int = 0
+ finished: Optional[str] = None
+ stopping: bool = False
+ stopReason: Optional[str] = None
+ initRedis: bool = False
+ crawlerImage: Optional[str] = None
+ lastActiveTime: str = ""
+ podStatus: Optional[DefaultDict[str, PodInfo]] = defaultdict(
+ lambda: PodInfo() # pylint: disable=unnecessary-lambda
+ )
+ # placeholder for pydantic 2.0 -- will require this version
+ # podStatus: Optional[
+ # DefaultDict[str, Annotated[PodInfo, Field(default_factory=PodInfo)]]
+ # ]
+ restartTime: Optional[str]
+ canceled: bool = False
+
+ # updated on pod exits and at regular interval
+ # Crawl Execution Time -- time all crawler pods have been running
+ # used to track resource usage and enforce execution minutes limit
+ crawlExecTime: int = 0
+
+ # Elapsed Exec Time -- time crawl has been running in at least one pod
+ # used for crawl timeouts
+ elapsedCrawlTime: int = 0
+
+ # last exec time update
+ lastUpdatedTime: str = ""
+
+ # any pods exited
+ anyCrawlPodNewExit: Optional[bool] = Field(default=False, exclude=True)
+
+    # don't include in status, used by metacontroller
+ resync_after: Optional[int] = Field(default=None, exclude=True)
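`PodResources` normalizes Kubernetes quantity strings with `kubernetes.utils.parse_quantity`, and `PodInfo` reports used/allocated ratios. A self-contained sketch of that arithmetic, with made-up quantities:

```python
from kubernetes.utils import parse_quantity

# Illustrative quantities only -- normalize k8s strings to numbers, as
# PodResources.__init__ does, then compute the ratios PodInfo exposes.
allocated_memory = int(parse_quantity("1Gi"))  # 1073741824 bytes
used_memory = int(parse_quantity("512Mi"))     # 536870912 bytes
allocated_cpu = float(parse_quantity("500m"))  # 0.5 cores
used_cpu = float(parse_quantity("250m"))       # 0.25 cores

percent_memory = used_memory / allocated_memory if allocated_memory else 0
percent_cpu = used_cpu / allocated_cpu if allocated_cpu else 0

print(f"memory: {percent_memory:.2f}, cpu: {percent_cpu:.2f}")  # 0.50, 0.50
```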
diff --git a/backend/btrixcloud/operator/profiles.py b/backend/btrixcloud/operator/profiles.py
new file mode 100644
index 0000000000..713252d7c5
--- /dev/null
+++ b/backend/btrixcloud/operator/profiles.py
@@ -0,0 +1,57 @@
+""" Operator handler for ProfileJobs """
+
+from btrixcloud.utils import (
+ from_k8s_date,
+ dt_now,
+)
+
+from btrixcloud.models import StorageRef
+
+from .models import MCSyncData
+from .baseoperator import BaseOperator
+
+
+# ============================================================================
+class ProfileOperator(BaseOperator):
+ """ProfileOperator"""
+
+ def init_routes(self, app):
+ """init routes for this operator"""
+
+ @app.post("/op/profilebrowsers/sync")
+ async def mc_sync_profile_browsers(data: MCSyncData):
+ return await self.sync_profile_browsers(data)
+
+ async def sync_profile_browsers(self, data: MCSyncData):
+ """sync profile browsers"""
+ spec = data.parent.get("spec", {})
+
+ expire_time = from_k8s_date(spec.get("expireTime"))
+ browserid = spec.get("id")
+
+ if dt_now() >= expire_time:
+ self.run_task(self.k8s.delete_profile_browser(browserid))
+ return {"status": {}, "children": []}
+
+ params = {}
+ params.update(self.k8s.shared_params)
+ params["id"] = browserid
+ params["userid"] = spec.get("userid", "")
+
+ oid = spec.get("oid")
+ storage = StorageRef(spec.get("storageName"))
+
+ storage_path = storage.get_storage_extra_path(oid)
+ storage_secret = storage.get_storage_secret_name(oid)
+
+ params["storage_path"] = storage_path
+ params["storage_secret"] = storage_secret
+ params["profile_filename"] = spec.get("profileFilename", "")
+ params["crawler_image"] = spec["crawlerImage"]
+
+ params["url"] = spec.get("startUrl", "about:blank")
+ params["vnc_password"] = spec.get("vncPassword")
+
+ children = self.load_from_yaml("profilebrowser.yaml", params)
+
+ return {"status": {}, "children": children}
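A rough sketch of the spec fields `sync_profile_browsers` reads and the expiry short-circuit, using placeholder values. The key names mirror the handler above; the timestamp parsing below is only a stand-in for `from_k8s_date`, so treat the date handling as an assumption.

```python
from datetime import datetime, timedelta

# Hypothetical ProfileJob spec -- keys mirror those read in sync_profile_browsers
spec = {
    "id": "browser-abc123",
    "userid": "11111111-2222-3333-4444-555555555555",
    "oid": "66666666-7777-8888-9999-000000000000",
    "storageName": "default",
    "crawlerImage": "webrecorder/browsertrix-crawler:latest",
    "startUrl": "https://example.com/",
    "expireTime": (datetime.utcnow() + timedelta(minutes=5)).isoformat() + "Z",
}

# Stand-in for from_k8s_date(): parse the RFC 3339 timestamp
expire_time = datetime.fromisoformat(spec["expireTime"].rstrip("Z"))

if datetime.utcnow() >= expire_time:
    print("expired: the handler deletes the browser and returns no children")
else:
    print("active: the handler renders profilebrowser.yaml children from spec")
```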
diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py
index 01a31551ee..a90cafcba0 100644
--- a/backend/btrixcloud/orgs.py
+++ b/backend/btrixcloud/orgs.py
@@ -1,6 +1,7 @@
"""
Organization API handling
"""
+
# pylint: disable=too-many-lines
import math
@@ -21,6 +22,7 @@
SUCCESSFUL_STATES,
RUNNING_STATES,
STARTING_STATES,
+ BaseCrawl,
Organization,
StorageRef,
OrgQuotas,
@@ -520,18 +522,22 @@ async def set_origin(self, org: Organization, request: Request):
{"_id": org.id}, {"$set": {"origin": origin}}
)
- async def inc_org_time_stats(self, oid, duration, is_exec_time=False):
+ async def inc_org_time_stats(self, oid, duration, is_exec_time=False, is_qa=False):
"""inc crawl duration stats for org
Overage is applied only to crawlExecSeconds - monthlyExecSeconds,
giftedExecSeconds, and extraExecSeconds are added to only up to quotas
+
+ If is_qa is true, also update the separate QA-only counter
"""
- # pylint: disable=too-many-return-statements
+ # pylint: disable=too-many-return-statements, too-many-locals
key = "crawlExecSeconds" if is_exec_time else "usage"
yymm = datetime.utcnow().strftime("%Y-%m")
- await self.orgs.find_one_and_update(
- {"_id": oid}, {"$inc": {f"{key}.{yymm}": duration}}
- )
+ inc_query = {f"{key}.{yymm}": duration}
+ if is_qa:
+ qa_key = "qaCrawlExecSeconds" if is_exec_time else "qaUsage"
+ inc_query[f"{qa_key}.{yymm}"] = duration
+ await self.orgs.find_one_and_update({"_id": oid}, {"$inc": inc_query})
if not is_exec_time:
return
@@ -630,17 +636,17 @@ async def get_org_metrics(self, org: Organization):
upload_count = 0
page_count = 0
- async for item in self.crawls_db.find({"oid": org.id}):
- if item["state"] not in SUCCESSFUL_STATES:
+ async for item_data in self.crawls_db.find({"oid": org.id}):
+ item = BaseCrawl.from_dict(item_data)
+ if item.state not in SUCCESSFUL_STATES:
continue
archived_item_count += 1
- type_ = item.get("type")
- if type_ == "crawl":
+ if item.type == "crawl":
crawl_count += 1
- if type_ == "upload":
+ if item.type == "upload":
upload_count += 1
- if item.get("stats"):
- page_count += item.get("stats", {}).get("done", 0)
+ if item.stats:
+ page_count += item.stats.done
profile_count = await self.profiles_db.count_documents({"oid": org.id})
workflows_running_count = await self.crawls_db.count_documents(
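The `inc_org_time_stats` change above folds the QA counters into a single `$inc` document. A small sketch of the query it builds, with field names taken from the diff and an illustrative duration:

```python
from datetime import datetime

def build_inc_query(duration: int, is_exec_time: bool, is_qa: bool) -> dict:
    """Mirror the $inc document assembled in inc_org_time_stats."""
    key = "crawlExecSeconds" if is_exec_time else "usage"
    yymm = datetime.utcnow().strftime("%Y-%m")
    inc_query = {f"{key}.{yymm}": duration}
    if is_qa:
        qa_key = "qaCrawlExecSeconds" if is_exec_time else "qaUsage"
        inc_query[f"{qa_key}.{yymm}"] = duration
    return inc_query

# e.g. {'crawlExecSeconds.2024-03': 90, 'qaCrawlExecSeconds.2024-03': 90}
print(build_inc_query(90, is_exec_time=True, is_qa=True))
```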
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
new file mode 100644
index 0000000000..231ff9cb58
--- /dev/null
+++ b/backend/btrixcloud/pages.py
@@ -0,0 +1,631 @@
+"""crawl pages"""
+
+import asyncio
+import traceback
+from datetime import datetime
+from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
+from uuid import UUID, uuid4
+
+from fastapi import Depends, HTTPException
+import pymongo
+
+from .models import (
+ Page,
+ PageOut,
+ PageOutWithSingleQA,
+ PageReviewUpdate,
+ PageQACompare,
+ Organization,
+ PaginatedResponse,
+ User,
+ PageNote,
+ PageNoteIn,
+ PageNoteEdit,
+ PageNoteDelete,
+)
+from .pagination import DEFAULT_PAGE_SIZE, paginated_format
+from .utils import from_k8s_date
+
+if TYPE_CHECKING:
+ from .crawls import CrawlOps
+ from .orgs import OrgOps
+ from .storages import StorageOps
+else:
+ CrawlOps = StorageOps = OrgOps = object
+
+
+# ============================================================================
+# pylint: disable=too-many-instance-attributes, too-many-arguments
+class PageOps:
+ """crawl pages"""
+
+ crawl_ops: CrawlOps
+ org_ops: OrgOps
+ storage_ops: StorageOps
+
+ def __init__(self, mdb, crawl_ops, org_ops, storage_ops):
+ self.pages = mdb["pages"]
+ self.crawls = mdb["crawls"]
+ self.crawl_ops = crawl_ops
+ self.org_ops = org_ops
+ self.storage_ops = storage_ops
+
+ async def init_index(self):
+ """init index for pages db collection"""
+ await self.pages.create_index([("crawl_id", pymongo.HASHED)])
+
+ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
+ """Add pages to database from WACZ files"""
+ pages_buffer: List[Page] = []
+ try:
+ crawl = await self.crawl_ops.get_crawl_out(crawl_id)
+ stream = await self.storage_ops.sync_stream_wacz_pages(
+ crawl.resources or []
+ )
+ for page_dict in stream:
+ if not page_dict.get("url"):
+ continue
+
+                # flush buffer once batch size is exceeded, then reset it
+                if len(pages_buffer) > batch_size:
+                    await self._add_pages_to_db(pages_buffer)
+                    pages_buffer = []
+
+ pages_buffer.append(
+ self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
+ )
+
+ # Add any remaining pages in buffer to db
+ if pages_buffer:
+ await self._add_pages_to_db(pages_buffer)
+
+ print(f"Added pages for crawl {crawl_id} to db", flush=True)
+ # pylint: disable=broad-exception-caught, raise-missing-from
+ except Exception as err:
+ traceback.print_exc()
+ print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
+
+ def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
+ """Return Page object from dict"""
+ page_id = page_dict.get("id")
+        if not page_id:
+            print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True)
+            page_id = uuid4()
+
+ status = page_dict.get("status")
+ if not status and page_dict.get("loadState"):
+ status = 200
+
+ return Page(
+ id=page_id,
+ oid=oid,
+ crawl_id=crawl_id,
+ url=page_dict.get("url"),
+ title=page_dict.get("title"),
+ loadState=page_dict.get("loadState"),
+ status=status,
+ ts=(
+ from_k8s_date(page_dict.get("ts"))
+ if page_dict.get("ts")
+ else datetime.now()
+ ),
+ )
+
+ async def _add_pages_to_db(self, pages: List[Page]):
+ """Add batch of pages to db in one insert"""
+ result = await self.pages.insert_many(
+ [
+ page.to_dict(
+ exclude_unset=True, exclude_none=True, exclude_defaults=True
+ )
+ for page in pages
+ ]
+ )
+ if not result.inserted_ids:
+ # pylint: disable=broad-exception-raised
+ raise Exception("No pages inserted")
+
+ async def add_page_to_db(
+ self,
+ page_dict: Dict[str, Any],
+ crawl_id: str,
+ qa_run_id: Optional[str],
+ oid: UUID,
+ ):
+ """Add page to database"""
+ page = self._get_page_from_dict(page_dict, crawl_id, oid)
+
+ try:
+ await self.pages.insert_one(
+ page.to_dict(
+ exclude_unset=True, exclude_none=True, exclude_defaults=True
+ )
+ )
+ except pymongo.errors.DuplicateKeyError:
+ pass
+
+ # pylint: disable=broad-except
+ except Exception as err:
+ print(
+ f"Error adding page {page.id} from crawl {crawl_id} to db: {err}",
+ flush=True,
+ )
+ return
+
+ # qa data
+ if qa_run_id and page:
+ compare_dict = page_dict.get("comparison")
+ if compare_dict is None:
+ print("QA Run, but compare data missing!")
+ return
+
+ compare = PageQACompare(**compare_dict)
+ print("Adding QA Run Data for Page", page_dict.get("url"), compare)
+
+ await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)
+
+ async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
+ """Delete crawl pages from db"""
+ query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
+ if oid:
+ query["oid"] = oid
+ try:
+ await self.pages.delete_many(query)
+ # pylint: disable=broad-except
+ except Exception as err:
+ print(
+ f"Error deleting pages from crawl {crawl_id}: {err}",
+ flush=True,
+ )
+
+ async def get_page_raw(
+ self,
+ page_id: UUID,
+ oid: UUID,
+ crawl_id: Optional[str] = None,
+ ) -> Dict[str, Any]:
+ """Return page dict by id"""
+ query: Dict[str, Union[str, UUID]] = {"_id": page_id, "oid": oid}
+ if crawl_id:
+ query["crawl_id"] = crawl_id
+
+ page = await self.pages.find_one(query)
+ if not page:
+ raise HTTPException(status_code=404, detail="page_not_found")
+ return page
+
+ async def get_page(
+ self,
+ page_id: UUID,
+ oid: UUID,
+ crawl_id: Optional[str] = None,
+ ) -> Page:
+ """Return Page object by id"""
+ page_raw = await self.get_page_raw(page_id, oid, crawl_id)
+ return Page.from_dict(page_raw)
+
+ async def add_qa_run_for_page(
+ self, page_id: UUID, oid: UUID, qa_run_id: str, compare: PageQACompare
+ ) -> bool:
+ """Update page heuristics and mime/type from QA run"""
+
+ # modified = datetime.utcnow().replace(microsecond=0, tzinfo=None)
+
+ result = await self.pages.find_one_and_update(
+ {"_id": page_id, "oid": oid},
+ {"$set": {f"qa.{qa_run_id}": compare.dict()}},
+ return_document=pymongo.ReturnDocument.AFTER,
+ )
+
+ if not result:
+ raise HTTPException(status_code=404, detail="page_not_found")
+
+ return True
+
+ async def delete_qa_run_from_pages(self, crawl_id: str, qa_run_id: str):
+ """delete pages"""
+ result = await self.pages.update_many(
+ {"crawl_id": crawl_id}, {"$unset": {f"qa.{qa_run_id}": ""}}
+ )
+ return result
+
+ async def update_page_approval(
+ self,
+ page_id: UUID,
+ oid: UUID,
+ approved: Optional[bool] = None,
+ crawl_id: Optional[str] = None,
+ user: Optional[User] = None,
+ ) -> Dict[str, bool]:
+ """Update page manual review"""
+ query: Dict[str, Union[Optional[bool], str, datetime, UUID]] = {
+ "approved": approved
+ }
+ query["modified"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)
+ if user:
+ query["userid"] = user.id
+
+ result = await self.pages.find_one_and_update(
+ {"_id": page_id, "oid": oid, "crawl_id": crawl_id},
+ {"$set": query},
+ return_document=pymongo.ReturnDocument.AFTER,
+ )
+
+ if not result:
+ raise HTTPException(status_code=404, detail="page_not_found")
+
+ return {"updated": True}
+
+ async def add_page_note(
+ self,
+ page_id: UUID,
+ oid: UUID,
+ text: str,
+ user: User,
+ crawl_id: str,
+ ) -> Dict[str, bool]:
+ """Add note to page"""
+ note = PageNote(id=uuid4(), text=text, userid=user.id, userName=user.name)
+
+ modified = datetime.utcnow().replace(microsecond=0, tzinfo=None)
+
+ result = await self.pages.find_one_and_update(
+ {"_id": page_id, "oid": oid, "crawl_id": crawl_id},
+ {
+ "$push": {"notes": note.dict()},
+ "$set": {"modified": modified},
+ },
+ return_document=pymongo.ReturnDocument.AFTER,
+ )
+
+ if not result:
+ raise HTTPException(status_code=404, detail="page_not_found")
+
+ return {"added": True}
+
+ async def update_page_note(
+ self,
+ page_id: UUID,
+ oid: UUID,
+ note_in: PageNoteEdit,
+ user: User,
+ crawl_id: str,
+ ) -> Dict[str, bool]:
+ """Update specific page note"""
+ page = await self.get_page_raw(page_id, oid)
+ page_notes = page.get("notes", [])
+
+ try:
+ matching_index = [
+ index
+ for index, note in enumerate(page_notes)
+ if note["id"] == note_in.id
+ ][0]
+
+ except IndexError:
+ # pylint: disable=raise-missing-from
+ raise HTTPException(status_code=404, detail="page_note_not_found")
+
+ new_note = PageNote(
+ id=note_in.id, text=note_in.text, userid=user.id, userName=user.name
+ )
+ page_notes[matching_index] = new_note.dict()
+
+ modified = datetime.utcnow().replace(microsecond=0, tzinfo=None)
+
+ result = await self.pages.find_one_and_update(
+ {"_id": page_id, "oid": oid, "crawl_id": crawl_id},
+ {"$set": {"notes": page_notes, "modified": modified}},
+ return_document=pymongo.ReturnDocument.AFTER,
+ )
+
+ if not result:
+ raise HTTPException(status_code=404, detail="page_not_found")
+
+ return {"updated": True}
+
+ async def delete_page_notes(
+ self,
+ page_id: UUID,
+ oid: UUID,
+ delete: PageNoteDelete,
+ crawl_id: str,
+ ) -> Dict[str, bool]:
+ """Delete specific page notes"""
+ page = await self.get_page_raw(page_id, oid)
+ page_notes = page.get("notes", [])
+
+ remaining_notes = []
+ for note in page_notes:
+            if note.get("id") not in delete.delete_list:
+ remaining_notes.append(note)
+
+ modified = datetime.utcnow().replace(microsecond=0, tzinfo=None)
+
+ result = await self.pages.find_one_and_update(
+ {"_id": page_id, "oid": oid, "crawl_id": crawl_id},
+ {"$set": {"notes": remaining_notes, "modified": modified}},
+ return_document=pymongo.ReturnDocument.AFTER,
+ )
+
+ if not result:
+ raise HTTPException(status_code=404, detail="page_not_found")
+
+ return {"deleted": True}
+
+ async def list_pages(
+ self,
+ crawl_id: str,
+ org: Optional[Organization] = None,
+ qa_run_id: Optional[str] = None,
+ qa_filter_by: Optional[str] = None,
+ qa_gte: Optional[float] = None,
+ qa_gt: Optional[float] = None,
+ qa_lte: Optional[float] = None,
+ qa_lt: Optional[float] = None,
+ page_size: int = DEFAULT_PAGE_SIZE,
+ page: int = 1,
+ sort_by: Optional[str] = None,
+ sort_direction: Optional[int] = -1,
+ ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
+ """List all pages in crawl"""
+ # pylint: disable=duplicate-code, too-many-locals, too-many-branches
+ # Zero-index page for query
+ page = page - 1
+ skip = page_size * page
+
+ query: dict[str, object] = {
+ "crawl_id": crawl_id,
+ }
+ if org:
+ query["oid"] = org.id
+
+ if qa_run_id:
+ query[f"qa.{qa_run_id}"] = {"$exists": True}
+
+ range_filter = {}
+
+ if qa_gte:
+ range_filter["$gte"] = qa_gte
+ if qa_lte:
+ range_filter["$lte"] = qa_lte
+ if qa_gt:
+ range_filter["$gt"] = qa_gt
+ if qa_lt:
+ range_filter["$lt"] = qa_lt
+
+ if qa_filter_by:
+ if not range_filter:
+ raise HTTPException(status_code=400, detail="range_missing")
+
+ query[f"qa.{qa_run_id}.{qa_filter_by}"] = range_filter
+
+ aggregate = [{"$match": query}]
+
+ if sort_by:
+ # Sorting options to add:
+ # - automated heuristics like screenshot_comparison (dict keyed by QA run id)
+ # - Ensure notes sorting works okay with notes in list
+ sort_fields = ("url", "title", "notes", "approved")
+ qa_sort_fields = ("screenshotMatch", "textMatch")
+ if sort_by not in sort_fields and sort_by not in qa_sort_fields:
+ raise HTTPException(status_code=400, detail="invalid_sort_by")
+ if sort_direction not in (1, -1):
+ raise HTTPException(status_code=400, detail="invalid_sort_direction")
+
+ if sort_by in qa_sort_fields:
+ if not qa_run_id:
+ raise HTTPException(
+ status_code=400, detail="qa_run_id_missing_for_qa_sort"
+ )
+
+ sort_by = f"qa.{qa_run_id}.{sort_by}"
+
+ aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+
+ if qa_run_id:
+ aggregate.extend([{"$set": {"qa": f"$qa.{qa_run_id}"}}])
+ # aggregate.extend([{"$project": {"qa": f"$qa.{qa_run_id}"}}])
+
+ aggregate.extend(
+ [
+ {
+ "$facet": {
+ "items": [
+ {"$skip": skip},
+ {"$limit": page_size},
+ ],
+ "total": [{"$count": "count"}],
+ }
+ },
+ ]
+ )
+
+ # Get total
+ cursor = self.pages.aggregate(aggregate)
+ results = await cursor.to_list(length=1)
+ result = results[0]
+ items = result["items"]
+
+ try:
+ total = int(result["total"][0]["count"])
+ except (IndexError, ValueError):
+ total = 0
+
+ if qa_run_id:
+ return [PageOutWithSingleQA.from_dict(data) for data in items], total
+
+ return [PageOut.from_dict(data) for data in items], total
+
+ async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
+ """Delete existing pages for crawl and re-add from WACZs."""
+ await self.delete_crawl_pages(crawl_id, oid)
+ print(f"Deleted pages for crawl {crawl_id}", flush=True)
+ await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+ async def re_add_all_crawl_pages(self, oid: UUID):
+ """Re-add pages for all crawls in org"""
+ crawl_ids = await self.crawls.distinct(
+ "_id", {"type": "crawl", "finished": {"$ne": None}}
+ )
+ for crawl_id in crawl_ids:
+ await self.re_add_crawl_pages(crawl_id, oid)
+
+
+# ============================================================================
+# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
+def init_pages_api(app, mdb, crawl_ops, org_ops, storage_ops, user_dep):
+ """init pages API"""
+ # pylint: disable=invalid-name
+
+ ops = PageOps(mdb, crawl_ops, org_ops, storage_ops)
+
+ org_crawl_dep = org_ops.org_crawl_dep
+
+ @app.post("/orgs/{oid}/crawls/all/pages/reAdd", tags=["pages"])
+ async def re_add_all_crawl_pages(
+ org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep)
+ ):
+ """Re-add pages for all crawls in org (superuser only)"""
+ if not user.is_superuser:
+ raise HTTPException(status_code=403, detail="Not Allowed")
+
+ asyncio.create_task(ops.re_add_all_crawl_pages(org.id))
+ return {"started": True}
+
+ @app.post("/orgs/{oid}/crawls/{crawl_id}/pages/reAdd", tags=["pages"])
+ async def re_add_crawl_pages(
+ crawl_id: str, org: Organization = Depends(org_crawl_dep)
+ ):
+ """Re-add pages for crawl"""
+ asyncio.create_task(ops.re_add_crawl_pages(crawl_id, org.id))
+ return {"started": True}
+
+ @app.get(
+ "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}",
+ tags=["pages"],
+ response_model=Page,
+ )
+ async def get_page(
+ crawl_id: str,
+ page_id: UUID,
+ org: Organization = Depends(org_crawl_dep),
+ ):
+ """GET single page"""
+ return await ops.get_page(page_id, org.id, crawl_id)
+
+ @app.patch(
+ "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}",
+ tags=["pages"],
+ )
+ async def update_page_approval(
+ crawl_id: str,
+ page_id: UUID,
+ update: PageReviewUpdate,
+ org: Organization = Depends(org_crawl_dep),
+ user: User = Depends(user_dep),
+ ):
+ """Update review for specific page"""
+ return await ops.update_page_approval(
+ page_id, org.id, update.approved, crawl_id, user
+ )
+
+ @app.post(
+ "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}/notes",
+ tags=["pages"],
+ )
+ async def add_page_note(
+ crawl_id: str,
+ page_id: UUID,
+ note: PageNoteIn,
+ org: Organization = Depends(org_crawl_dep),
+ user: User = Depends(user_dep),
+ ):
+ """Add note to page"""
+ return await ops.add_page_note(page_id, org.id, note.text, user, crawl_id)
+
+ @app.patch(
+ "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}/notes",
+ tags=["pages"],
+ )
+ async def edit_page_note(
+ crawl_id: str,
+ page_id: UUID,
+ note: PageNoteEdit,
+ org: Organization = Depends(org_crawl_dep),
+ user: User = Depends(user_dep),
+ ):
+ """Edit page note"""
+ return await ops.update_page_note(page_id, org.id, note, user, crawl_id)
+
+ @app.post(
+ "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}/notes/delete",
+ tags=["pages"],
+ )
+ async def delete_page_notes(
+ crawl_id: str,
+ page_id: UUID,
+ delete: PageNoteDelete,
+ org: Organization = Depends(org_crawl_dep),
+ ):
+ """Edit page note"""
+ return await ops.delete_page_notes(page_id, org.id, delete, crawl_id)
+
+ @app.get(
+ "/orgs/{oid}/crawls/{crawl_id}/pages",
+ tags=["pages"],
+ response_model=PaginatedResponse,
+ )
+ async def get_pages_list(
+ crawl_id: str,
+ org: Organization = Depends(org_crawl_dep),
+ pageSize: int = DEFAULT_PAGE_SIZE,
+ page: int = 1,
+ sortBy: Optional[str] = None,
+ sortDirection: Optional[int] = -1,
+ ):
+ """Retrieve paginated list of pages"""
+ pages, total = await ops.list_pages(
+ crawl_id=crawl_id,
+ org=org,
+ page_size=pageSize,
+ page=page,
+ sort_by=sortBy,
+ sort_direction=sortDirection,
+ )
+ return paginated_format(pages, total, page, pageSize)
+
+ @app.get(
+ "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages",
+ tags=["pages", "qa"],
+ response_model=PaginatedResponse,
+ )
+ async def get_pages_list_with_qa(
+ crawl_id: str,
+ qa_run_id: str,
+ filterQABy: Optional[str] = None,
+ gte: Optional[float] = None,
+ gt: Optional[float] = None,
+ lte: Optional[float] = None,
+ lt: Optional[float] = None,
+ org: Organization = Depends(org_crawl_dep),
+ pageSize: int = DEFAULT_PAGE_SIZE,
+ page: int = 1,
+ sortBy: Optional[str] = None,
+ sortDirection: Optional[int] = -1,
+ ):
+ """Retrieve paginated list of pages"""
+ pages, total = await ops.list_pages(
+ crawl_id=crawl_id,
+ org=org,
+ qa_run_id=qa_run_id,
+ qa_filter_by=filterQABy,
+ qa_gte=gte,
+ qa_gt=gt,
+ qa_lte=lte,
+ qa_lt=lt,
+ page_size=pageSize,
+ page=page,
+ sort_by=sortBy,
+ sort_direction=sortDirection,
+ )
+ return paginated_format(pages, total, page, pageSize)
+
+ return ops
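A hedged sketch of how a client could exercise the new QA pages listing, filtering and sorting by screenshot match. The query parameter names come from the endpoint signature above; the host, token, and IDs are placeholders.

```python
import requests

API_PREFIX = "https://btrix.example.com/api"   # placeholder host
headers = {"Authorization": "Bearer <token>"}  # placeholder auth
org_id, crawl_id, qa_run_id = "<org-uuid>", "<crawl-id>", "<qa-run-id>"

# List QA pages with screenshotMatch >= 0.9, best matches first
r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/qa/{qa_run_id}/pages",
    headers=headers,
    params={
        "filterQABy": "screenshotMatch",
        "gte": 0.9,
        "sortBy": "screenshotMatch",
        "sortDirection": -1,
        "pageSize": 25,
        "page": 1,
    },
)
r.raise_for_status()
for page in r.json()["items"]:
    print(page["url"], page["qa"]["screenshotMatch"])
```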
diff --git a/backend/btrixcloud/pagination.py b/backend/btrixcloud/pagination.py
index 4823613368..9b9e727060 100644
--- a/backend/btrixcloud/pagination.py
+++ b/backend/btrixcloud/pagination.py
@@ -1,4 +1,5 @@
"""API pagination"""
+
from typing import Any, List, Optional
diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index a15406940a..835aadffee 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -1,6 +1,7 @@
"""
Storage API
"""
+
from typing import (
Optional,
Iterator,
@@ -9,21 +10,24 @@
Dict,
AsyncIterator,
TYPE_CHECKING,
+ Any,
)
from urllib.parse import urlsplit
-from contextlib import asynccontextmanager, contextmanager
+from contextlib import asynccontextmanager
+from itertools import chain
import asyncio
import heapq
import zlib
import json
-import itertools
import os
from datetime import datetime
+from zipfile import ZipInfo
from fastapi import Depends, HTTPException
from stream_zip import stream_zip, NO_COMPRESSION_64
+from remotezip import RemoteZip
import aiobotocore.session
import boto3
@@ -41,10 +45,6 @@
S3StorageIn,
OrgStorageRefs,
)
-from .zip import (
- sync_get_zip_file,
- sync_get_log_stream,
-)
from .utils import is_bool, slug_from_name
@@ -72,12 +72,21 @@ class StorageOps:
org_ops: OrgOps
crawl_manager: CrawlManager
+ is_local_minio: bool
+ frontend_origin: str
+
def __init__(self, org_ops, crawl_manager) -> None:
self.org_ops = org_ops
self.crawl_manager = crawl_manager
self.is_local_minio = is_bool(os.environ.get("IS_LOCAL_MINIO"))
+ frontend_origin = os.environ.get(
+ "FRONTEND_ORIGIN", "http://browsertrix-cloud-frontend"
+ )
+ default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default")
+ self.frontend_origin = f"{frontend_origin}.{default_namespace}"
+
with open(os.environ["STORAGES_JSON"], encoding="utf-8") as fh:
storage_list = json.loads(fh.read())
@@ -275,8 +284,10 @@ async def get_s3_client(
) as client:
yield client, bucket, key
- @contextmanager
- def get_sync_client(self, org: Organization) -> Iterator[tuple[S3Client, str, str]]:
+ @asynccontextmanager
+ async def get_sync_client(
+ self, org: Organization
+ ) -> AsyncIterator[tuple[S3Client, str, str]]:
"""context manager for s3 client"""
storage = self.get_org_primary_storage(org)
@@ -312,6 +323,12 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None
resp = await client.put_object(Bucket=bucket, Key=key, Body=data)
assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200
+ def resolve_internal_access_path(self, path):
+ """Resolve relative path for internal access to minio bucket"""
+ if path.startswith("/"):
+ return self.frontend_origin + path
+ return path
+
def get_org_relative_path(
self, org: Organization, ref: StorageRef, file_path: str
) -> str:
@@ -497,7 +514,7 @@ async def _delete_file(
s3storage = self.get_org_storage_by_ref(org, storage)
- async with self.get_s3_client(s3storage, s3storage.use_access_for_presign) as (
+ async with self.get_s3_client(s3storage) as (
client,
bucket,
key,
@@ -508,55 +525,53 @@ async def _delete_file(
return status_code == 204
+ async def sync_stream_wacz_pages(
+ self, wacz_files: List[CrawlFileOut]
+ ) -> Iterator[Dict[Any, Any]]:
+ """Return stream of pages specified WACZ"""
+ loop = asyncio.get_event_loop()
+
+ resp = await loop.run_in_executor(None, self._sync_get_pages, wacz_files)
+
+ return resp
+
async def sync_stream_wacz_logs(
self,
- org: Organization,
- wacz_files: List[CrawlFile],
+ wacz_files: List[CrawlFileOut],
log_levels: List[str],
contexts: List[str],
) -> Iterator[bytes]:
"""Return filtered stream of logs from specified WACZs sorted by timestamp"""
- with self.get_sync_client(org) as (client, bucket, key):
- loop = asyncio.get_event_loop()
-
- resp = await loop.run_in_executor(
- None,
- self._sync_get_logs,
- wacz_files,
- log_levels,
- contexts,
- client,
- bucket,
- key,
- )
+ loop = asyncio.get_event_loop()
+
+ resp = await loop.run_in_executor(
+ None,
+ self._sync_get_logs,
+ wacz_files,
+ log_levels,
+ contexts,
+ )
- return resp
+ return resp
def _sync_get_logs(
self,
- wacz_files: List[CrawlFile],
+ wacz_files: List[CrawlFileOut],
log_levels: List[str],
contexts: List[str],
- client,
- bucket: str,
- key: str,
) -> Iterator[bytes]:
"""Generate filtered stream of logs from specified WACZs sorted by timestamp"""
# pylint: disable=too-many-function-args
def stream_log_lines(
- wacz_key, wacz_filename, cd_start, log_zipinfo
+ log_zipinfo: ZipInfo, wacz_url: str, wacz_filename: str
) -> Iterator[dict]:
"""Pass lines as json objects"""
+ filename = log_zipinfo.filename
- print(
- f"Fetching log {log_zipinfo.filename} from {wacz_filename}", flush=True
- )
-
- line_iter: Iterator[bytes] = sync_get_log_stream(
- client, bucket, wacz_key, log_zipinfo, cd_start
- )
+ print(f"Fetching log {filename} from {wacz_filename}", flush=True)
+ line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
for line in line_iter:
yield _parse_json(line.decode("utf-8", errors="ignore"))
@@ -573,14 +588,14 @@ def stream_json_lines(
yield json_str.encode("utf-8")
def organize_based_on_instance_number(
- wacz_files: List[CrawlFile],
- ) -> List[List[CrawlFile]]:
+ wacz_files: List[CrawlFileOut],
+ ) -> List[List[CrawlFileOut]]:
"""Place wacz_files into their own list based on instance number"""
- wacz_files.sort(key=lambda file: file.filename)
- waczs_groups: Dict[str, List[CrawlFile]] = {}
+ wacz_files.sort(key=lambda file: file.name)
+ waczs_groups: Dict[str, List[CrawlFileOut]] = {}
for file in wacz_files:
- instance_number = file.filename[
- file.filename.rfind("-") + 1 : file.filename.rfind(".")
+ instance_number = file.name[
+ file.name.rfind("-") + 1 : file.name.rfind(".")
]
if instance_number in waczs_groups:
waczs_groups[instance_number].append(file)
@@ -595,29 +610,73 @@ def organize_based_on_instance_number(
wacz_log_streams: List[Iterator[dict]] = []
for wacz_file in instance_list:
- wacz_key = key + wacz_file.filename
- cd_start, zip_file = sync_get_zip_file(client, bucket, wacz_key)
-
- log_files = [
- f
- for f in zip_file.filelist
- if f.filename.startswith("logs/") and not f.is_dir()
- ]
- log_files.sort(key=lambda log_zipinfo: log_zipinfo.filename)
-
- for log_zipinfo in log_files:
- wacz_log_streams.append(
- stream_log_lines(
- wacz_key, wacz_file.filename, cd_start, log_zipinfo
+ wacz_url = self.resolve_internal_access_path(wacz_file.path)
+ with RemoteZip(wacz_url) as remote_zip:
+ log_files: List[ZipInfo] = [
+ f
+ for f in remote_zip.infolist()
+ if f.filename.startswith("logs/") and not f.is_dir()
+ ]
+ log_files.sort(key=lambda log_zipinfo: log_zipinfo.filename)
+
+ for log_zipinfo in log_files:
+ wacz_log_streams.append(
+ stream_log_lines(log_zipinfo, wacz_url, wacz_file.name)
)
- )
- log_generators.append(itertools.chain(*wacz_log_streams))
+ log_generators.append(chain(*wacz_log_streams))
heap_iter = heapq.merge(*log_generators, key=lambda entry: entry["timestamp"])
return stream_json_lines(heap_iter, log_levels, contexts)
+ def _sync_get_pages(
+ self,
+ wacz_files: List[CrawlFileOut],
+ ) -> Iterator[Dict[Any, Any]]:
+ """Generate stream of page dicts from specified WACZs"""
+
+ # pylint: disable=too-many-function-args
+ def stream_page_lines(
+ pagefile_zipinfo: ZipInfo, wacz_url: str, wacz_filename: str
+ ) -> Iterator[Dict[Any, Any]]:
+ """Pass lines as json objects"""
+ filename = pagefile_zipinfo.filename
+
+ print(
+ f"Fetching JSON lines from {filename} in {wacz_filename}",
+ flush=True,
+ )
+
+ line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
+ for line in line_iter:
+ yield _parse_json(line.decode("utf-8", errors="ignore"))
+
+ page_generators: List[Iterator[Dict[Any, Any]]] = []
+
+ for wacz_file in wacz_files:
+ wacz_url = self.resolve_internal_access_path(wacz_file.path)
+ with RemoteZip(wacz_url) as remote_zip:
+ page_files: List[ZipInfo] = [
+ f
+ for f in remote_zip.infolist()
+ if f.filename.startswith("pages/")
+ and f.filename.endswith(".jsonl")
+ and not f.is_dir()
+ ]
+ for pagefile_zipinfo in page_files:
+ page_generators.append(
+ stream_page_lines(pagefile_zipinfo, wacz_url, wacz_file.name)
+ )
+
+ return chain.from_iterable(page_generators)
+
+ def _sync_get_filestream(self, wacz_url: str, filename: str) -> Iterator[bytes]:
+ """Return iterator of lines in remote file as bytes"""
+ with RemoteZip(wacz_url) as remote_zip:
+ with remote_zip.open(filename) as file_stream:
+ yield from file_stream
+
def _sync_dl(
self, all_files: List[CrawlFileOut], client: S3Client, bucket: str, key: str
) -> Iterator[bytes]:
@@ -664,7 +723,7 @@ async def download_streaming_wacz(
) -> Iterator[bytes]:
"""return an iter for downloading a stream nested wacz file
from list of files"""
- with self.get_sync_client(org) as (client, bucket, key):
+ async with self.get_sync_client(org) as (client, bucket, key):
loop = asyncio.get_event_loop()
resp = await loop.run_in_executor(
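The move to `remotezip` replaces the hand-rolled ZIP range-request parsing removed in `zip.py` below; the library reads the central directory over HTTP range requests and exposes member streams directly. A minimal standalone sketch of streaming `pages/*.jsonl` records out of a WACZ by URL (the URL is a placeholder, and `json.loads` stands in for the module's `_parse_json` helper):

```python
import json

from remotezip import RemoteZip

wacz_url = "https://example.com/path/to/crawl.wacz"  # placeholder WACZ URL

# Same access pattern as _sync_get_pages: find pages/*.jsonl members via the
# remote central directory, then stream each one line by line.
with RemoteZip(wacz_url) as remote_zip:
    page_files = [
        f
        for f in remote_zip.infolist()
        if f.filename.startswith("pages/")
        and f.filename.endswith(".jsonl")
        and not f.is_dir()
    ]
    for info in page_files:
        with remote_zip.open(info.filename) as stream:
            for line in stream:
                try:
                    record = json.loads(line.decode("utf-8", errors="ignore"))
                except json.JSONDecodeError:
                    continue  # skip any malformed lines
                if record.get("url"):  # the pages.jsonl header line has no url
                    print(record["url"], record.get("title"))
```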
diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py
index 6f8f6474a0..2b3f6e2023 100644
--- a/backend/btrixcloud/uploads.py
+++ b/backend/btrixcloud/uploads.py
@@ -39,6 +39,15 @@
class UploadOps(BaseCrawlOps):
"""upload ops"""
+ async def get_upload(
+ self,
+ crawlid: str,
+ org: Optional[Organization] = None,
+ ) -> UploadedCrawl:
+ """Get crawl data for internal use"""
+ res = await self.get_crawl_raw(crawlid, org, "upload")
+ return UploadedCrawl.from_dict(res)
+
# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-function-args
# pylint: disable=too-many-arguments, too-many-locals, duplicate-code, invalid-name
async def upload_stream(
@@ -60,7 +69,7 @@ async def upload_stream(
prev_upload = None
if replaceId:
try:
- prev_upload = await self.get_crawl_raw(replaceId, org, "upload")
+ prev_upload = await self.get_upload(replaceId, org)
except HTTPException:
# not found
replaceId = None
@@ -371,7 +380,7 @@ async def list_uploads(
response_model=CrawlOut,
)
async def get_upload(crawlid: str, org: Organization = Depends(org_crawl_dep)):
- return await ops.get_crawl(crawlid, org, "upload")
+ return await ops.get_crawl_out(crawlid, org, "upload")
@app.get(
"/orgs/all/uploads/{crawl_id}/replay.json",
@@ -382,7 +391,7 @@ async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)):
if not user.is_superuser:
raise HTTPException(status_code=403, detail="Not Allowed")
- return await ops.get_crawl(crawl_id, None, "upload")
+ return await ops.get_crawl_out(crawl_id, None, "upload")
@app.get(
"/orgs/{oid}/uploads/{crawl_id}/replay.json",
@@ -390,7 +399,7 @@ async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)):
response_model=CrawlOutWithResources,
)
async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)):
- return await ops.get_crawl(crawl_id, org, "upload")
+ return await ops.get_crawl_out(crawl_id, org, "upload")
@app.patch("/orgs/{oid}/uploads/{crawl_id}", tags=["uploads"])
async def update_uploads_api(
diff --git a/backend/btrixcloud/version.py b/backend/btrixcloud/version.py
index 158676b2b8..5375dd619a 100644
--- a/backend/btrixcloud/version.py
+++ b/backend/btrixcloud/version.py
@@ -1,2 +1,3 @@
""" current version """
-__version__ = "1.9.0-beta.2"
+
+__version__ = "1.10.0-beta.0"
diff --git a/backend/btrixcloud/webhooks.py b/backend/btrixcloud/webhooks.py
index aafe10e80c..5b3ec99243 100644
--- a/backend/btrixcloud/webhooks.py
+++ b/backend/btrixcloud/webhooks.py
@@ -195,12 +195,12 @@ async def _create_item_finished_notification(
body: Union[CrawlFinishedBody, UploadFinishedBody],
):
"""Create webhook notification for finished crawl/upload."""
- crawl = await self.crawl_ops.get_crawl(crawl_id, org)
+ crawl = await self.crawl_ops.get_crawl_out(crawl_id, org)
if not crawl:
print(f"Crawl {crawl_id} not found, skipping event webhook", flush=True)
return
- body.resources = crawl.resources
+ body.resources = crawl.resources or []
notification = WebhookNotification(
id=uuid4(),
diff --git a/backend/btrixcloud/zip.py b/backend/btrixcloud/zip.py
deleted file mode 100644
index e1c0d445ac..0000000000
--- a/backend/btrixcloud/zip.py
+++ /dev/null
@@ -1,198 +0,0 @@
-"""
-Methods for interacting with zip/WACZ files
-"""
-import io
-import struct
-import zipfile
-import zlib
-
-
-# ============================================================================
-EOCD_RECORD_SIZE = 22
-ZIP64_EOCD_RECORD_SIZE = 56
-ZIP64_EOCD_LOCATOR_SIZE = 20
-
-MAX_STANDARD_ZIP_SIZE = 4_294_967_295
-
-CHUNK_SIZE = 1024 * 256
-
-
-# ============================================================================
-def sync_get_log_stream(client, bucket, key, log_zipinfo, cd_start):
- """Return uncompressed byte stream of log file in WACZ"""
- # pylint: disable=too-many-locals
- file_head = sync_fetch(
- client, bucket, key, cd_start + log_zipinfo.header_offset + 26, 4
- )
- name_len = parse_little_endian_to_int(file_head[0:2])
- extra_len = parse_little_endian_to_int(file_head[2:4])
-
- content = sync_fetch_stream(
- client,
- bucket,
- key,
- cd_start + log_zipinfo.header_offset + 30 + name_len + extra_len,
- log_zipinfo.compress_size,
- )
-
- if log_zipinfo.compress_type == zipfile.ZIP_DEFLATED:
- uncompressed_content = zlib.decompressobj(-zlib.MAX_WBITS).decompress(content)
- else:
- uncompressed_content = content
-
- return sync_iter_lines(uncompressed_content)
-
-
-def sync_iter_lines(chunk_iter, keepends=True):
- """
- Iter by lines, adapted from botocore
- """
- pending = b""
- for chunk in chunk_iter:
- lines = (pending + chunk).splitlines(True)
- for line in lines[:-1]:
- yield line.splitlines(keepends)[0]
- pending = lines[-1]
- if pending:
- yield pending.splitlines(keepends)[0]
-
-
-async def get_zip_file(client, bucket, key):
- """Fetch enough of the WACZ file be able to read the zip filelist"""
- file_size = await get_file_size(client, bucket, key)
- eocd_record = await fetch(
- client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
- )
-
- if file_size <= MAX_STANDARD_ZIP_SIZE:
- cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
- central_directory = await fetch(client, bucket, key, cd_start, cd_size)
- return (
- cd_start,
- zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)),
- )
-
- zip64_eocd_record = await fetch(
- client,
- bucket,
- key,
- file_size
- - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
- ZIP64_EOCD_RECORD_SIZE,
- )
- zip64_eocd_locator = await fetch(
- client,
- bucket,
- key,
- file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
- ZIP64_EOCD_LOCATOR_SIZE,
- )
- cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
- central_directory = await fetch(client, bucket, key, cd_start, cd_size)
- return (
- cd_start,
- zipfile.ZipFile(
- io.BytesIO(
- central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
- )
- ),
- )
-
-
-def sync_get_zip_file(client, bucket, key):
- """Fetch enough of the WACZ file be able to read the zip filelist"""
- file_size = sync_get_file_size(client, bucket, key)
- eocd_record = sync_fetch(
- client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
- )
-
- if file_size <= MAX_STANDARD_ZIP_SIZE:
- cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
- central_directory = sync_fetch(client, bucket, key, cd_start, cd_size)
- with zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)) as zip_file:
- return (cd_start, zip_file)
-
- zip64_eocd_record = sync_fetch(
- client,
- bucket,
- key,
- file_size
- - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
- ZIP64_EOCD_RECORD_SIZE,
- )
- zip64_eocd_locator = sync_fetch(
- client,
- bucket,
- key,
- file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
- ZIP64_EOCD_LOCATOR_SIZE,
- )
- cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
- central_directory = sync_fetch(client, bucket, key, cd_start, cd_size)
- with zipfile.ZipFile(
- io.BytesIO(
- central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
- )
- ) as zip_file:
- return (cd_start, zip_file)
-
-
-async def get_file_size(client, bucket, key):
- """Get WACZ file size from HEAD request"""
- head_response = await client.head_object(Bucket=bucket, Key=key)
- return head_response["ContentLength"]
-
-
-def sync_get_file_size(client, bucket, key):
- """Get WACZ file size from HEAD request"""
- head_response = client.head_object(Bucket=bucket, Key=key)
- return head_response["ContentLength"]
-
-
-async def fetch(client, bucket, key, start, length):
- """Fetch a byte range from a file in object storage"""
- end = start + length - 1
- response = await client.get_object(
- Bucket=bucket, Key=key, Range=f"bytes={start}-{end}"
- )
- return await response["Body"].read()
-
-
-def sync_fetch(client, bucket, key, start, length):
- """Fetch a byte range from a file in object storage"""
- end = start + length - 1
- response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}")
- return response["Body"].read()
-
-
-def sync_fetch_stream(client, bucket, key, start, length):
- """Fetch a byte range from a file in object storage as a stream"""
- end = start + length - 1
- response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}")
- return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)
-
-
-def get_central_directory_metadata_from_eocd(eocd):
- """Get central directory start and size"""
- cd_size = parse_little_endian_to_int(eocd[12:16])
- cd_start = parse_little_endian_to_int(eocd[16:20])
- return cd_start, cd_size
-
-
-def get_central_directory_metadata_from_eocd64(eocd64):
- """Get central directory start and size for zip64"""
- cd_size = parse_little_endian_to_int(eocd64[40:48])
- cd_start = parse_little_endian_to_int(eocd64[48:56])
- return cd_start, cd_size
-
-
-def parse_little_endian_to_int(little_endian_bytes):
- """Convert little endian used in zip spec to int"""
- byte_length = len(little_endian_bytes)
- format_character = "q"
- if byte_length == 4:
- format_character = "i"
- elif byte_length == 2:
- format_character = "h"
-
- return struct.unpack("<" + format_character, little_endian_bytes)[0]
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 0877636c95..078472546e 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -9,7 +9,7 @@ email-validator
#fastapi-users[mongodb]==9.2.2
loguru
aiofiles
-kubernetes-asyncio==25.11.0
+kubernetes-asyncio==29.0.0
kubernetes
aiobotocore
redis>=5.0.0
@@ -28,3 +28,4 @@ types_aiobotocore_s3
types-redis
types-python-slugify
types-pyYAML
+remotezip
diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py
new file mode 100644
index 0000000000..6029541843
--- /dev/null
+++ b/backend/test/test_qa.py
@@ -0,0 +1,187 @@
+from .conftest import API_PREFIX, HOST_PREFIX
+import requests
+import time
+from datetime import datetime
+
+qa_run_id = None
+
+
+def test_run_qa(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.post(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start",
+ headers=crawler_auth_headers,
+ )
+
+ assert r.status_code == 200
+
+ data = r.json()
+ assert data["started"]
+ global qa_run_id
+ qa_run_id = data["started"]
+
+
+def test_run_qa_already_running(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.post(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start",
+ headers=crawler_auth_headers,
+ )
+
+ assert r.status_code == 400
+ assert r.json()["detail"] == "qa_already_running"
+
+
+def test_active_qa(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA",
+ headers=crawler_auth_headers,
+ )
+
+ data = r.json()
+ qa = data["qa"]
+
+ assert qa
+ assert qa["state"]
+ assert qa["started"]
+ assert not qa["finished"]
+
+
+def test_qa_list(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
+ headers=crawler_auth_headers,
+ )
+
+ data = r.json()
+
+ assert len(data) == 1
+
+ qa = data[0]
+ assert qa
+ assert qa["state"]
+ assert qa["started"]
+ assert not qa["finished"]
+
+
+def test_wait_for_complete(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ count = 0
+ completed = False
+ while count < 24:
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA",
+ headers=crawler_auth_headers,
+ )
+
+ data = r.json()
+ if not data["qa"]:
+ completed = True
+ break
+
+ time.sleep(5)
+ count += 1
+
+ assert completed
+
+
+def test_qa_completed(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
+ headers=crawler_auth_headers,
+ )
+
+ data = r.json()
+
+ assert len(data) == 1
+
+ qa = data[0]
+ assert qa
+ assert qa["state"] == "complete"
+ assert qa["started"]
+ assert qa["finished"]
+ assert qa["stats"]["found"] == 1
+ assert qa["stats"]["done"] == 1
+ assert qa["crawlExecSeconds"] > 0
+
+
+def test_qa_org_stats(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+ headers=crawler_auth_headers,
+ )
+ crawl_stats = r.json()
+ assert crawl_stats["qaCrawlExecSeconds"] > 0
+
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}",
+ headers=crawler_auth_headers,
+ )
+ org_stats = r.json()
+
+ yymm = datetime.utcnow().strftime("%Y-%m")
+ assert org_stats["qaCrawlExecSeconds"][yymm] > 0
+ assert org_stats["qaUsage"][yymm] > 0
+
+
+def test_qa_page_data(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages",
+ headers=crawler_auth_headers,
+ )
+ data = r.json()
+ assert len(data["items"]) == 1
+ page = data["items"][0]
+ assert page["title"] == "Webrecorder"
+ assert page["url"] == "https://webrecorder.net/"
+ assert page["qa"]["textMatch"] == 1.0
+ assert page["qa"]["screenshotMatch"] == 1.0
+ assert page["qa"]["resourceCounts"] == {
+ "crawlGood": 15,
+ "crawlBad": 0,
+ "replayGood": 15,
+ "replayBad": 1,
+ }
+
+
+def test_qa_replay(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/replay.json",
+ headers=crawler_auth_headers,
+ )
+ data = r.json()
+ assert len(data["resources"]) == 1
+ assert data["resources"][0]["path"]
+
+
+def test_run_qa_not_running(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.post(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/stop",
+ headers=crawler_auth_headers,
+ )
+
+ assert r.status_code == 400
+ assert r.json()["detail"] == "qa_not_running"
+
+
+def test_delete_qa_run(crawler_crawl_id, crawler_auth_headers, default_org_id):
+ r = requests.post(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/delete",
+ json={"qa_run_ids": [qa_run_id]},
+ headers=crawler_auth_headers,
+ )
+
+ assert r.status_code == 200
+    assert r.json()["deleted"] is True
+
+ # deleted from finished qa list
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
+ headers=crawler_auth_headers,
+ )
+
+ assert len(r.json()) == 0
+
+ # deleted from pages
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages",
+ headers=crawler_auth_headers,
+ )
+ assert len(r.json()["items"]) == 0
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index ab42a4c1a9..59720c40dc 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -16,6 +16,8 @@
wacz_content = None
+page_id = None
+
def test_list_orgs(admin_auth_headers, default_org_id):
r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
@@ -280,6 +282,43 @@ def test_update_crawl(
assert data["description"] == UPDATED_DESC
assert data["name"] == UPDATED_NAME
assert data["collectionIds"] == UPDATED_COLLECTION_IDS
+ assert data.get("reviewStatus") is None
+
+ # Update reviewStatus and verify
+ r = requests.patch(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+ headers=admin_auth_headers,
+ json={
+ "reviewStatus": "good",
+ },
+ )
+ assert r.status_code == 200
+ data = r.json()
+ assert data["updated"]
+
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+ headers=admin_auth_headers,
+ )
+ assert r.status_code == 200
+ assert r.json()["reviewStatus"] == "good"
+
+ # Try to update to invalid reviewStatus
+ r = requests.patch(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+ headers=admin_auth_headers,
+ json={
+ "reviewStatus": "invalid",
+ },
+ )
+ assert r.status_code == 422
+
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+ headers=admin_auth_headers,
+ )
+ assert r.status_code == 200
+ assert r.json()["reviewStatus"] == "good"
# Verify deleting works as well
r = requests.patch(
@@ -374,6 +413,213 @@ def test_crawl_stats(crawler_auth_headers, default_org_id):
assert row["avg_page_time"] or row["avg_page_time"] == 0
+def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
+ # Test GET list endpoint
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ data = r.json()
+ assert data["total"] >= 0
+
+ pages = data["items"]
+ assert pages
+
+ for page in pages:
+ assert page["id"]
+ assert page["oid"]
+ assert page["crawl_id"]
+ assert page["url"]
+ assert page["ts"]
+ assert page.get("title") or page.get("title") is None
+ assert page["loadState"]
+ assert page["status"]
+
+ # Test GET page endpoint
+ global page_id
+ page_id = pages[0]["id"]
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ page = r.json()
+
+ assert page["id"] == page_id
+ assert page["oid"]
+ assert page["crawl_id"]
+ assert page["url"]
+ assert page["ts"]
+ assert page.get("title") or page.get("title") is None
+ assert page["loadState"]
+
+ assert page["notes"] == []
+ assert page.get("userid") is None
+ assert page.get("modified") is None
+ assert page.get("approved") is None
+
+ # Update page with approval
+ r = requests.patch(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
+ headers=crawler_auth_headers,
+ json={
+ "approved": True,
+ },
+ )
+ assert r.status_code == 200
+ assert r.json()["updated"]
+
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ page = r.json()
+
+ assert page["id"] == page_id
+ assert page["oid"]
+ assert page["crawl_id"]
+ assert page["url"]
+ assert page["ts"]
+ assert page.get("title") or page.get("title") is None
+ assert page["loadState"]
+
+ assert page["notes"] == []
+ assert page["userid"]
+ assert page["modified"]
+ assert page["approved"]
+
+
+def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
+ # Re-add pages and verify they were correctly added
+ r = requests.post(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ assert r.json()["started"]
+
+ time.sleep(10)
+
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ data = r.json()
+ assert data["total"] >= 0
+
+ pages = data["items"]
+ assert pages
+
+ for page in pages:
+ assert page["id"]
+ assert page["oid"]
+ assert page["crawl_id"]
+ assert page["url"]
+ assert page["ts"]
+ assert page.get("title") or page.get("title") is None
+ assert page["loadState"]
+ assert page["status"]
+
+ # Ensure only superuser can re-add pages for all crawls in an org
+ r = requests.post(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/all/pages/reAdd",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 403
+
+
+def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):
+ note_text = "testing"
+ updated_note_text = "updated"
+ untouched_text = "untouched"
+
+ # Add note
+ r = requests.post(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes",
+ headers=crawler_auth_headers,
+ json={"text": note_text},
+ )
+ assert r.status_code == 200
+ assert r.json()["added"]
+
+ # Check that note was added
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ data = r.json()
+
+ assert len(data["notes"]) == 1
+
+ first_note = data["notes"][0]
+
+ first_note_id = first_note["id"]
+ assert first_note_id
+
+ assert first_note["created"]
+ assert first_note["userid"]
+ assert first_note["userName"]
+ assert first_note["text"] == note_text
+
+ # Add second note to test selective updates/deletes
+ r = requests.post(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes",
+ headers=crawler_auth_headers,
+ json={"text": untouched_text},
+ )
+ assert r.status_code == 200
+ assert r.json()["added"]
+
+ # Edit first note
+ r = requests.patch(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes",
+ headers=crawler_auth_headers,
+ json={"text": updated_note_text, "id": first_note_id},
+ )
+ assert r.status_code == 200
+ assert r.json()["updated"]
+
+ # Verify notes look as expected
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ data = r.json()
+ notes = data["notes"]
+
+ assert len(notes) == 2
+
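+ # Only the first note should carry the edited text; the second note stays untouched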
+ updated_note = [note for note in notes if note["id"] == first_note_id][0]
+ assert updated_note["text"] == updated_note_text
+
+ second_note_id = [note["id"] for note in notes if note["text"] == untouched_text][0]
+ assert second_note_id
+
+ # Delete both notes
+ r = requests.post(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes/delete",
+ headers=crawler_auth_headers,
+ json={"delete_list": [first_note_id, second_note_id]},
+ )
+ assert r.status_code == 200
+ assert r.json()["deleted"]
+
+ # Verify notes were deleted
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ data = r.json()
+ notes = data.get("notes")
+ assert notes == []
+
+
def test_delete_crawls_crawler(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
@@ -387,6 +633,14 @@ def test_delete_crawls_crawler(
data = r.json()
assert data["detail"] == "not_allowed"
+ # Check that pages exist for crawl
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ assert r.json()["total"] > 0
+
# Test that crawler user can delete own crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
@@ -398,6 +652,8 @@ def test_delete_crawls_crawler(
assert data["deleted"] == 1
assert data["storageQuotaReached"] is False
+ time.sleep(5)
+
# Test that crawl is not found after deleting
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
@@ -405,6 +661,14 @@ def test_delete_crawls_crawler(
)
assert r.status_code == 404
+ # Test that associated pages are also deleted
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
+ headers=crawler_auth_headers,
+ )
+ assert r.status_code == 200
+ assert r.json()["total"] == 0
+
def test_delete_crawls_org_owner(
admin_auth_headers,
diff --git a/backend/test/test_settings.py b/backend/test/test_settings.py
index 1f4515ac43..a15f012ee5 100644
--- a/backend/test/test_settings.py
+++ b/backend/test/test_settings.py
@@ -14,5 +14,6 @@ def test_settings():
"jwtTokenLifetime": 86400,
"defaultBehaviorTimeSeconds": 300,
"maxPagesPerCrawl": 4,
+ "maxScale": 3,
"defaultPageLoadTimeSeconds": 120,
}
diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index 15c477d073..e4249e9b04 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -1,5 +1,6 @@
import requests
import os
+import time
from urllib.parse import urljoin
from .conftest import API_PREFIX
@@ -934,6 +935,8 @@ def test_delete_form_upload_and_crawls_from_all_crawls(
assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size
assert data["storageUsedUploads"] == org_upload_bytes - upload_size
+ time.sleep(10)
+
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{all_crawls_delete_config_id}",
headers=admin_auth_headers,
diff --git a/backend/test/test_utils.py b/backend/test/test_utils.py
index a43b3cb7f3..4c523f78f9 100644
--- a/backend/test/test_utils.py
+++ b/backend/test/test_utils.py
@@ -1,4 +1,5 @@
"""utils tests"""
+
import pytest
from btrixcloud.utils import slug_from_name
diff --git a/chart/Chart.yaml b/chart/Chart.yaml
index 4a994666e7..920172b154 100644
--- a/chart/Chart.yaml
+++ b/chart/Chart.yaml
@@ -5,7 +5,7 @@ type: application
icon: https://webrecorder.net/assets/icon.png
# Browsertrix Cloud and Chart Version
-version: v1.9.0-beta.2
+version: v1.10.0-beta.0
dependencies:
- name: btrix-admin-logging
diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml
index 84fad5ef92..3255e56f99 100644
--- a/chart/app-templates/crawl_job.yaml
+++ b/chart/app-templates/crawl_job.yaml
@@ -4,7 +4,7 @@ metadata:
name: crawljob-{{ id }}
labels:
crawl: "{{ id }}"
- role: "job"
+ role: {{ "qa-job" if qa_source else "job" }}
btrix.org: "{{ oid }}"
btrix.user: "{{ userid }}"
btrix.storage: "{{ storage_name }}"
@@ -19,11 +19,15 @@ spec:
cid: "{{ cid }}"
oid: "{{ oid }}"
scale: {{ scale }}
- maxCrawlSize: {{ max_crawl_size }}
- timeout: {{ timeout }}
+
+ maxCrawlSize: {{ max_crawl_size if not qa_source else 0 }}
+ timeout: {{ timeout if not qa_source else 0 }}
+ qaSourceCrawlId: "{{ qa_source }}"
+
manual: {{ manual }}
crawlerChannel: "{{ crawler_channel }}"
- ttlSecondsAfterFinished: 30
+ ttlSecondsAfterFinished: {{ 30 if not qa_source else 0 }}
+ warcPrefix: "{{ warc_prefix }}"
storageName: "{{ storage_name }}"
diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml
index 67d8c58853..e9ea1834d0 100644
--- a/chart/app-templates/crawler.yaml
+++ b/chart/app-templates/crawler.yaml
@@ -53,8 +53,11 @@ spec:
volumes:
- name: crawl-config
configMap:
+ {% if not qa_source_crawl_id %}
name: crawl-config-{{ cid }}
-
+ {% else %}
+ name: qa-replay-{{ qa_source_crawl_id }}
+ {% endif %}
- name: crawl-data
persistentVolumeClaim:
claimName: {{ name }}
@@ -102,6 +105,7 @@ spec:
image: {{ crawler_image }}
imagePullPolicy: {{ crawler_image_pull_policy }}
command:
+ {% if not qa_source_crawl_id %}
- crawl
- --config
- /tmp/crawl-config.json
@@ -112,6 +116,14 @@ spec:
- "@{{ profile_filename }}"
{%- endif %}
+ {% else %}
+ - qa
+ - --qaSource
+ - /tmp/crawl-config.json
+ - --redisStoreUrl
+ - {{ redis_url }}
+ - --writePagesToRedis
+ {% endif %}
volumeMounts:
- name: crawl-config
mountPath: /tmp/crawl-config.json
@@ -149,6 +161,9 @@ spec:
- name: STORE_USER
value: "{{ userid }}"
+ - name: WARC_PREFIX
+ value: "{{ warc_prefix }}"
+
{% if crawler_socks_proxy_host %}
- name: SOCKS_HOST
value: "{{ crawler_socks_proxy_host }}"
diff --git a/chart/app-templates/profilebrowser.yaml b/chart/app-templates/profilebrowser.yaml
index 335f705c7d..7c1dab8884 100644
--- a/chart/app-templates/profilebrowser.yaml
+++ b/chart/app-templates/profilebrowser.yaml
@@ -78,8 +78,8 @@ spec:
resources:
limits:
- memory: "{{ crawler_memory }}"
+ memory: "{{ profile_memory }}"
requests:
- cpu: "{{ crawler_cpu }}"
- memory: "{{ crawler_memory }}"
+ cpu: "{{ profile_cpu }}"
+ memory: "{{ profile_memory }}"
diff --git a/chart/app-templates/qa_configmap.yaml b/chart/app-templates/qa_configmap.yaml
new file mode 100644
index 0000000000..9fd9e4051b
--- /dev/null
+++ b/chart/app-templates/qa_configmap.yaml
@@ -0,0 +1,14 @@
+# -------
+# CONFIGMAP
+# -------
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: {{ name }}
+ namespace: {{ namespace }}
+ labels:
+ crawl: {{ id }}
+ role: crawler
+
+data:
+ crawl-config.json: {{ qa_source_replay_json | tojson }}
diff --git a/chart/app-templates/replica_job.yaml b/chart/app-templates/replica_job.yaml
index 30870d3fb9..88a7da17b8 100644
--- a/chart/app-templates/replica_job.yaml
+++ b/chart/app-templates/replica_job.yaml
@@ -13,7 +13,7 @@ spec:
template:
spec:
restartPolicy: Never
- priorityClassName: bg-jobs
+ priorityClassName: bg-job
podFailurePolicy:
rules:
- action: FailJob
diff --git a/chart/email-templates/invite b/chart/email-templates/invite
index 61a2469d3b..72a8837d85 100644
--- a/chart/email-templates/invite
+++ b/chart/email-templates/invite
@@ -1,13 +1,13 @@
-Welcome to Browsertrix Cloud!
+Welcome to Browsertrix!
~~~
Hello!
-Welcome to Browsertrix Cloud!
+Welcome to Browsertrix!
{% if sender %}
-You have been invited by {{ sender }} to join "{{ org_name }}" on Browsertrix Cloud!
+You have been invited by {{ sender }} to join "{{ org_name }}" on Browsertrix!
When you first access your account, you’ll be directed to your Dashboard. It contains information you may want to view frequently including: Storage Usage, Crawling Info, Collections, and Monthly Usage History. From there, you can click + Create New to create your first Crawl Workflow!
diff --git a/frontend/src/components/ui/relative-duration.ts b/frontend/src/components/ui/relative-duration.ts
index a6df8bfce4..609b6547ed 100644
--- a/frontend/src/components/ui/relative-duration.ts
+++ b/frontend/src/components/ui/relative-duration.ts
@@ -1,6 +1,6 @@
-import { LitElement } from "lit";
-import { property, state, customElement } from "lit/decorators.js";
import { localized } from "@lit/localize";
+import { LitElement } from "lit";
+import { customElement, property, state } from "lit/decorators.js";
import humanizeDuration from "pretty-ms";
export type HumanizeOptions = {
@@ -84,7 +84,7 @@ export class RelativeDuration extends LitElement {
compact: this.compact,
verbose: this.verbose,
unitCount: this.unitCount,
- }
+ },
);
}
}
diff --git a/frontend/src/components/ui/search-combobox.ts b/frontend/src/components/ui/search-combobox.ts
index eec2d5e612..b4cc616c95 100644
--- a/frontend/src/components/ui/search-combobox.ts
+++ b/frontend/src/components/ui/search-combobox.ts
@@ -1,15 +1,18 @@
-import { LitElement, html, nothing } from "lit";
-import { property, state, query, customElement } from "lit/decorators.js";
-import { msg, localized } from "@lit/localize";
+import { localized, msg } from "@lit/localize";
+import type { SlInput, SlMenuItem } from "@shoelace-style/shoelace";
+import Fuse from "fuse.js";
+import { html, LitElement, nothing, type PropertyValues } from "lit";
+import { customElement, property, query, state } from "lit/decorators.js";
import { when } from "lit/directives/when.js";
import debounce from "lodash/fp/debounce";
-import Fuse from "fuse.js";
-import type { SlInput, SlMenuItem } from "@shoelace-style/shoelace";
-export type SelectEvent<T> = CustomEvent<{
+import { type UnderlyingFunction } from "@/types/utils";
+
+type SelectEventDetail<T> = {
key: string | null;
value?: T;
-}>;
+};
+export type SelectEvent<T> = CustomEvent<SelectEventDetail<T>>;
const MIN_SEARCH_LENGTH = 2;
const MAX_SEARCH_RESULTS = 10;
@@ -39,7 +42,7 @@ export class SearchCombobox extends LitElement {
placeholder: string = msg("Start typing to search");
@state()
- private searchByValue: string = "";
+ private searchByValue = "";
private get hasSearchStr() {
return this.searchByValue.length >= MIN_SEARCH_LENGTH;
@@ -49,7 +52,7 @@ export class SearchCombobox extends LitElement {
private searchResultsOpen = false;
@query("sl-input")
- private input!: SlInput;
+ private readonly input!: SlInput;
private fuse = new Fuse([], {
keys: [],
@@ -63,15 +66,19 @@ export class SearchCombobox extends LitElement {
super.disconnectedCallback();
}
- protected willUpdate(changedProperties: Map) {
+ protected willUpdate(changedProperties: PropertyValues) {
if (changedProperties.get("selectedKey") && !this.selectedKey) {
this.onSearchInput.cancel();
this.searchByValue = "";
}
- if (changedProperties.has("searchKeys") && this.searchKeys) {
+ if (changedProperties.has("searchKeys")) {
this.onSearchInput.cancel();
this.fuse = new Fuse([], {
- ...(this.fuse as any).options,
+ ...(
+ this.fuse as unknown as {
+ options: ConstructorParameters<typeof Fuse>[1];
+ }
+ ).options,
keys: this.searchKeys,
});
}
@@ -99,12 +106,12 @@ export class SearchCombobox extends LitElement {
this.searchByValue = item.value;
await this.updateComplete;
this.dispatchEvent(
- <SelectEvent<T>>new CustomEvent("btrix-select", {
+ new CustomEvent<SelectEventDetail<T>>("btrix-select", {
detail: {
- key: key,
- value: item.value,
+ key: key ?? null,
+ value: item.value as T,
},
- })
+ }),
);
}}
>
@@ -118,19 +125,21 @@ export class SearchCombobox extends LitElement {
this.onSearchInput.cancel();
this.dispatchEvent(new CustomEvent("btrix-clear"));
}}
- @sl-input=${this.onSearchInput as () => void}
+ @sl-input=${this.onSearchInput as UnderlyingFunction<
+ typeof this.onSearchInput
+ >}
>
${when(
- this.selectedKey && this.keyLabels?.[this.selectedKey as string],
+ this.selectedKey && this.keyLabels?.[this.selectedKey],
() =>
html`${this.keyLabels![this.selectedKey as string]}${this.keyLabels![this.selectedKey!]}`,
- () => html``
+ () => html``,
)}
${this.renderSearchResults()}
@@ -175,15 +184,15 @@ export class SearchCombobox extends LitElement {
`;
}
return nothing;
- })
+ }),
)}
`;
}
- private onSearchInput = debounce(150)(() => {
- this.searchByValue = this.input.value?.trim();
+ private readonly onSearchInput = debounce(150)(() => {
+ this.searchByValue = this.input.value.trim();
- if (this.searchResultsOpen === false && this.hasSearchStr) {
+ if (!this.searchResultsOpen && this.hasSearchStr) {
this.searchResultsOpen = true;
}
diff --git a/frontend/src/components/ui/section-heading.ts b/frontend/src/components/ui/section-heading.ts
index 110698dd9d..ffb4808150 100644
--- a/frontend/src/components/ui/section-heading.ts
+++ b/frontend/src/components/ui/section-heading.ts
@@ -1,4 +1,4 @@
-import { LitElement, html, css } from "lit";
+import { css, html, LitElement } from "lit";
import { customElement } from "lit/decorators.js";
/**
diff --git a/frontend/src/components/ui/select-crawler.ts b/frontend/src/components/ui/select-crawler.ts
index b4630c64de..484963d779 100644
--- a/frontend/src/components/ui/select-crawler.ts
+++ b/frontend/src/components/ui/select-crawler.ts
@@ -1,12 +1,24 @@
+import { localized, msg } from "@lit/localize";
+import { type SlSelect } from "@shoelace-style/shoelace";
import { html } from "lit";
-import { property, state, customElement } from "lit/decorators.js";
-import { msg, localized } from "@lit/localize";
-
-import type { AuthState } from "../../utils/AuthService";
-import type { CrawlerChannel } from "../../pages/org/types";
+import { customElement, property, state } from "lit/decorators.js";
+import capitalize from "lodash/fp/capitalize";
+import type { CrawlerChannel } from "@/pages/org/types";
+import type { AuthState } from "@/utils/AuthService";
import LiteElement from "@/utils/LiteElement";
-import capitalize from "lodash/fp/capitalize";
+
+type SelectCrawlerChangeDetail = {
+ value: string | undefined;
+};
+
+export type SelectCrawlerChangeEvent = CustomEvent<SelectCrawlerChangeDetail>;
+
+type SelectCrawlerUpdateDetail = {
+ show: boolean;
+};
+
+export type SelectCrawlerUpdateEvent = CustomEvent<SelectCrawlerUpdateDetail>;
type CrawlerChannelsAPIResponse = {
channels: CrawlerChannel[];
@@ -45,7 +57,7 @@ export class SelectCrawler extends LiteElement {
private crawlerChannels?: CrawlerChannel[];
protected firstUpdated() {
- this.fetchCrawlerChannels();
+ void this.fetchCrawlerChannels();
}
render() {
@@ -63,15 +75,16 @@ export class SelectCrawler extends LiteElement {
@sl-change=${this.onChange}
@sl-focus=${() => {
// Refetch to keep list up to date
- this.fetchCrawlerChannels();
+ void this.fetchCrawlerChannels();
}}
@sl-hide=${this.stopProp}
@sl-after-hide=${this.stopProp}
>
${this.crawlerChannels?.map(
- (crawler) => html`
- ${capitalize(crawler.id)}
- `
+ (crawler) =>
+ html`
+ ${capitalize(crawler.id)}
+ `,
)}
${this.selectedCrawler
? html`
@@ -87,19 +100,19 @@ export class SelectCrawler extends LiteElement {
`;
}
- private onChange(e: any) {
+ private onChange(e: Event) {
this.stopProp(e);
this.selectedCrawler = this.crawlerChannels?.find(
- ({ id }) => id === e.target.value
+ ({ id }) => id === (e.target as SlSelect).value,
);
this.dispatchEvent(
- new CustomEvent("on-change", {
+ new CustomEvent("on-change", {
detail: {
value: this.selectedCrawler?.id,
},
- })
+ }),
);
}
@@ -109,11 +122,11 @@ export class SelectCrawler extends LiteElement {
private async fetchCrawlerChannels(): Promise<void> {
try {
const channels = await this.getCrawlerChannels();
- this.crawlerChannels = channels as CrawlerChannel[];
+ this.crawlerChannels = channels;
if (this.crawlerChannel && !this.selectedCrawler) {
this.selectedCrawler = this.crawlerChannels.find(
- ({ id }) => id === this.crawlerChannel
+ ({ id }) => id === this.crawlerChannel,
);
}
@@ -124,19 +137,19 @@ export class SelectCrawler extends LiteElement {
detail: {
value: "default",
},
- })
+ }),
);
this.selectedCrawler = this.crawlerChannels.find(
- ({ id }) => id === this.crawlerChannel
+ ({ id }) => id === this.crawlerChannel,
);
}
this.dispatchEvent(
- new CustomEvent("on-update", {
+ new CustomEvent("on-update", {
detail: {
show: this.crawlerChannels.length > 1,
},
- })
+ }),
);
} catch (e) {
this.notify({
@@ -151,7 +164,7 @@ export class SelectCrawler extends LiteElement {
const data: CrawlerChannelsAPIResponse =
await this.apiFetch(
`/orgs/${this.orgId}/crawlconfigs/crawler-channels`,
- this.authState!
+ this.authState!,
);
return data.channels;
@@ -162,7 +175,7 @@ export class SelectCrawler extends LiteElement {
* Prevents bug where sl-dialog closes when dropdown closes
* https://github.com/shoelace-style/shoelace/issues/170
*/
- private stopProp(e: CustomEvent) {
+ private stopProp(e: Event) {
e.stopPropagation();
}
}
diff --git a/frontend/src/components/ui/tab-list.ts b/frontend/src/components/ui/tab-list.ts
index 220a8fe531..6ab9e435c7 100644
--- a/frontend/src/components/ui/tab-list.ts
+++ b/frontend/src/components/ui/tab-list.ts
@@ -1,11 +1,12 @@
-import { TailwindElement } from "@/classes/TailwindElement";
-import { LitElement, html, css } from "lit";
-import { property, queryAsync, customElement } from "lit/decorators.js";
+import { css, html, LitElement, type PropertyValues } from "lit";
+import { customElement, property, queryAsync } from "lit/decorators.js";
import { ifDefined } from "lit/directives/if-defined.js";
+import { TailwindElement } from "@/classes/TailwindElement";
+
const DEFAULT_PANEL_ID = "default-panel";
-// Breakpoint in pixels for 2-column layout
-const TWO_COL_SCREEN_MIN = 1032;
+// postcss-lit-disable-next-line
+export const TWO_COL_SCREEN_MIN_CSS = css`64.5rem`;
/**
* Tab list
@@ -59,7 +60,7 @@ export class Tab extends TailwindElement {
render() {
return html`