diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml index 13d9643090..169a6fb7bc 100644 --- a/.github/ISSUE_TEMPLATE/bug.yml +++ b/.github/ISSUE_TEMPLATE/bug.yml @@ -1,5 +1,5 @@ name: Bug Report -description: If something isn't working as expected and you're sure your issue is reproducible, please file this type of issue! +description: Report a demonstrable problem caused by code in this repo. title: "[Bug]: " labels: ["bug"] body: @@ -7,8 +7,8 @@ body: - type: input id: version attributes: - label: Browsertrix Cloud Version - description: This can be found in the site footer + label: Browsertrix Version + description: This can be found at the bottom of the Browsertrix web app. placeholder: "v1.5.0-beta.0-67d0c6a" validations: required: true @@ -17,27 +17,38 @@ body: attributes: label: What did you expect to happen? What happened instead? description: | - "I was trying to modify the Page Load Timeout value in a saved workflow, however..." - - Please submit any screenshots/videos that can be used to understand how to reproduce the issue. You can attach images by clicking this area to highlight it and then dragging files into the browser window. + A clear and concise description of the bug, and what you expected to happen instead. - If your problem is related to crawling, or something wasn't captured in the way you expect please include a link to the finished crawl/workflow if possible. + For issues related to crawling or replay, please include a link to the archived item and workflow when possible. validations: required: true # Step-by-step reproduction instructions - type: textarea attributes: - label: Step-by-step reproduction instructions + label: Reproduction instructions + description: Step-by-step description of how to reproduce the issue, including the page URL if applicable. placeholder: | 1. Navigate to... 2. Click on... 3. See error... validations: required: true + # Screenshots / videos + - type: textarea + attributes: + label: Screenshots / Video + description: Please attach any screenshots or screen recordings that demonstrate the bug. You can attach images by clicking this area to highlight it and then dragging files into the browser window. + # Environment + - type: input + attributes: + label: Environment + description: Please specify your browser if the issue is related to the web app, and provide information on your operating system if you're running Browsertrix locally. + placeholder: | + Browser: + Browser version: + OS: # Additional details - type: textarea attributes: label: Additional details - description: Add any other relevant information here, such as your local environment if you are running Browsertrix Cloud locally. - validations: - required: false + description: Any additional context that helps us investigate the issue. For example, does the issue only happen in a specific browser? Are there forum discussions related to your issue? diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 4eda5b88ba..f96130dd16 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -3,14 +3,14 @@ blank_issues_enabled: true contact_links: - name: Report a replay issue - about: Issues related to archived content not displaying properly should be reported in the ReplayWeb.page repo. + about: Issues related to an archived item or collection not replaying properly should be reported in the ReplayWeb.page repo.
url: https://github.com/webrecorder/replayweb.page/issues/new?&labels=replay+bug%2Cbug&projects=&template=replay-bug.yml&title=[Replay+Bug]%3A+ - name: Report a security vulnerability - about: Please do not file an issue and instead email security@webrecorder.org. We will follow up with you there! + about: Please email security@webrecorder.org directly. We will follow up with you there! url: https://webrecorder.net/.well-known/security.txt - name: Get help on our forum url: https://forum.webrecorder.net/ about: Have a ("how do I...?") question? Not sure if your issue is reproducible? The best way to get help is on our community forum! - name: Check out the docs url: https://docs.browsertrix.cloud - about: Solutions to common questions may be available in the documentation! + about: Find solutions to common questions, such as how to install, develop, and deploy Browsertrix. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature-change.yml b/.github/ISSUE_TEMPLATE/feature-change.yml index 2f171ebab3..8ce9df44fa 100644 --- a/.github/ISSUE_TEMPLATE/feature-change.yml +++ b/.github/ISSUE_TEMPLATE/feature-change.yml @@ -1,49 +1,20 @@ name: Feature / Change Request -description: If new things should be added or something that is working as intended should be changed, please file this type of issue! +description: Request a new feature or change to an existing feature of the app. title: "[Feature]: " labels: ["enhancement"] body: - # Context - - type: textarea - attributes: - label: Context - description: Describe any prior information that we are taking into account to inform this future development. - placeholder: "Now that x is done we should do y to accomplish z." - validations: - required: true # User story sentence - type: textarea attributes: label: What change would you like to see? - description: Describe the solution you'd like. If relevant, include ways in which you've tried to solve the issue with the current version. - placeholder: "As a user, I want to be able to ____ so that I can ____" + description: Describe the improvement or feature you'd like added to Browsertrix. + placeholder: I would like to be able to ____________ so that I can ____________. validations: required: true - # Requirements - - type: textarea - attributes: - label: Requirements - description: | - Intended primarily for use by Webrecorder team, leave blank if unknown. - - List the outcomes of the feature being implemented without design or implementation details. - placeholder: | - 1. Item metadata should show links to the collections that the item belongs to. - 2. Items can be added or removed from collections when editing an item. - validations: - required: false - # Todo + # Context - type: textarea attributes: - label: Todo - description: | - Intended primarily for use by Webrecorder team, leave blank if unknown. - - Any other linked issues / tasks to complete to implement this feature. - placeholder: | - - [ ] Mockups: - - [ ] Design: - - [ ] UI: - - [ ] API: + label: Context + description: Any background information that helps us understand the request. 
validations: - required: false + required: true \ No newline at end of file diff --git a/.github/workflows/deploy-dev.yaml b/.github/workflows/deploy-dev.yaml index 6d21a6604d..5f6261b84a 100644 --- a/.github/workflows/deploy-dev.yaml +++ b/.github/workflows/deploy-dev.yaml @@ -15,20 +15,45 @@ jobs: with: driver-opts: network=host - - name: Login to Regsitry + - name: Login to Registry uses: docker/login-action@v2 with: registry: ${{ secrets.DO_REGISTRY }} username: ${{ secrets.DO_API_TOKEN }} password: ${{ secrets.DO_API_TOKEN }} - - - name: Set Env Vars + - name: Set Env Vars run: | echo VERSION=`cat version.txt` >> $GITHUB_ENV echo GIT_COMMIT_HASH=`git rev-parse --short HEAD` >> $GITHUB_ENV echo GIT_BRANCH_NAME=`git rev-parse --abbrev-ref HEAD` >> $GITHUB_ENV + - name: Checkout values file from ops repo + uses: actions/checkout@v4 + with: + repository: "webrecorder/browsertrix-cloud-ops" + path: "browsertrix-cloud-ops" + ssh-key: ${{ secrets.DEPLOY_KEY_OPS_REPO }} + sparse-checkout: | + scripts/decrypt-values.py + values/btrix-dev-values.yml + poetry.lock + pyproject.toml + sparse-checkout-cone-mode: false + + - name: Install poetry + run: pipx install poetry + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "poetry" + + - name: Install vault decryption dependencies + working-directory: browsertrix-cloud-ops + run: | + poetry install + - name: Build Backend uses: docker/build-push-action@v3 with: @@ -57,12 +82,10 @@ jobs: - name: Get Kubeconfig env: KUBECONFIG_DATA: ${{ secrets.KUBECONFIG_DATA }} - DEV_VALUES: ${{ secrets.DEV_VALUES }} run: | printf "$KUBECONFIG_DATA" >> ./.kubeconfig chmod 400 ./.kubeconfig - printf "$DEV_VALUES" >> ./dev-values.yaml - name: Install Kubectl uses: azure/setup-kubectl@v3 @@ -72,6 +95,13 @@ jobs: with: version: 3.10.2 + - name: Decrypt values file + env: + ANSIBLE_VAULT_PASSWORD: ${{ secrets.ANSIBLE_VAULT_PASSWORD }} + working-directory: browsertrix-cloud-ops + run: | + poetry run python scripts/decrypt-values.py values/btrix-dev-values.yml ../dev-values.yaml + - name: Start Cluster with Helm run: | KUBECONFIG=./.kubeconfig helm upgrade --install -f ./chart/values.yaml -f ./dev-values.yaml btrix ./chart/ diff --git a/.github/workflows/frontend-build-check.yaml b/.github/workflows/frontend-build-check.yaml index da93d7ff1a..0d4bf7d684 100644 --- a/.github/workflows/frontend-build-check.yaml +++ b/.github/workflows/frontend-build-check.yaml @@ -15,7 +15,7 @@ jobs: - name: Setup Node uses: actions/setup-node@v3 with: - node-version: '16' + node-version: '18' cache: 'yarn' cache-dependency-path: frontend/yarn.lock - name: Restore cache @@ -30,6 +30,14 @@ jobs: env: HUSKY: 0 run: yarn install --frozen-lockfile + - name: Lint + working-directory: frontend + run: yarn lint:check + - name: Format + working-directory: frontend + # TODO Reenable when https://github.com/webrecorder/browsertrix-cloud/issues/1618 is addressed + # run: yarn format:check + run: yarn prettier --list-different . 
- name: Unit tests working-directory: frontend run: yarn test diff --git a/.github/workflows/publish-helm-chart.yaml b/.github/workflows/publish-helm-chart.yaml index e7e810819e..4f9e3ee814 100644 --- a/.github/workflows/publish-helm-chart.yaml +++ b/.github/workflows/publish-helm-chart.yaml @@ -4,6 +4,7 @@ on: push: branches: - main + - "*-release" jobs: package_chart: diff --git a/.github/workflows/ui-tests-playwright.yml b/.github/workflows/ui-tests-playwright.yml index 4fd201b8b2..4ae8484b8b 100644 --- a/.github/workflows/ui-tests-playwright.yml +++ b/.github/workflows/ui-tests-playwright.yml @@ -22,7 +22,7 @@ jobs: - name: Setup Node uses: actions/setup-node@v3 with: - node-version: '16' + node-version: '18' cache: 'yarn' cache-dependency-path: frontend/yarn.lock - name: Install dependencies diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 72a5ae839b..1d83ba443b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 22.12.0 + rev: 24.1.1 hooks: - id: black args: ["backend/btrixcloud/"] diff --git a/.vscode/extensions.json b/.vscode/extensions.json index ede834d5bd..58e22b1982 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -2,6 +2,7 @@ "recommendations": [ "dbaeumer.vscode-eslint", "esbenp.prettier-vscode", + "dbaeumer.vscode-eslint", "runem.lit-plugin", "bradlc.vscode-tailwindcss", "redhat.vscode-yaml", diff --git a/.vscode/settings.json b/.vscode/settings.json index e2ee7ebcc5..ae2e75fbc0 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -20,12 +20,16 @@ "Browsertrix", "btrix", "Elems", + "favicons", "hoverable", + "micromark", "novnc", "profileid", "tailwindcss", "wacz", "Webrecorder", + "wysimark", + "xstate", "zxcvbn" ], "cSpell.languageSettings": [ @@ -41,5 +45,8 @@ "css" ] } - ] + ], + "eslint.workingDirectories": ["./frontend"], + "eslint.nodePath": "./frontend/node_modules", + "tailwindCSS.experimental.classRegex": ["tw`([^`]*)"] } diff --git a/README.md b/README.md index fa086b3f68..4debb745d5 100644 --- a/README.md +++ b/README.md @@ -1,45 +1,42 @@ -# Browsertrix Cloud +
+ Browsertrix Logo +
-

+  -Browsertrix Cloud is an open-source cloud-native high-fidelity browser-based crawling service designed +Browsertrix is an open-source cloud-native high-fidelity browser-based crawling service designed to make web archiving easier and more accessible for everyone. -The service provides an API and UI for scheduling crawls and viewing results, -and managing all aspects of crawling process. This system provides the orchestration and management around crawling, -while the actual crawling is performed using -[Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) containers, which are launched for each crawl. +The service provides an API and UI for scheduling crawls and viewing results, and managing all aspects of the crawling process. This system provides the orchestration and management around crawling, while the actual crawling is performed using [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) containers, which are launched for each crawl. -See [Browsertrix Cloud](https://browsertrix.com) for a feature overview and information about Browsertrix Cloud Hosting. +See [browsertrix.com](https://browsertrix.com) for a feature overview and information about Browsertrix hosting. ## Documentation -The full docs for using, deploying and developing Browsertrix Cloud are available at: [https://docs.browsertrix.cloud](https://docs.browsertrix.cloud) +The full docs for using, deploying, and developing Browsertrix are available at: [https://docs.browsertrix.cloud](https://docs.browsertrix.cloud) -## Deployment +## Deployment The latest deployment documentation is available at: [https://docs.browsertrix.cloud/deploy](https://docs.browsertrix.cloud/deploy) -The docs cover deploying Browsertrix Cloud in different environments using Kubernetes, from a single-node setup to scalable clusters in the cloud. +The docs cover deploying Browsertrix in different environments using Kubernetes, from a single-node setup to scalable clusters in the cloud. -Previously, Browsertrix Cloud also supported Docker Compose and podman-based deployment. This is now deprecated due to the complexity -of maintaining feature parity across different setups, and with various Kubernetes deployment options being available and easy to deploy, even on a single machine. +Previously, Browsertrix also supported Docker Compose and podman-based deployment. This has been deprecated due to the complexity of maintaining feature parity across different setups, and because various Kubernetes deployment options are now available and easy to deploy, even on a single machine. -Making deployment of Browsertrix Cloud as easy as possible remains a key goal, and we welcome suggestions for how we can further improve our Kubernetes deployment options. +Making deployment of Browsertrix as easy as possible remains a key goal, and we welcome suggestions for how we can further improve our Kubernetes deployment options. If you are looking to just try running a single crawl, you may want to try [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) first to test out the crawling capabilities. ## Development Status -Browsertrix Cloud is currently in a beta, though the system and backend API is fairly stable, we are working on many additional features. +Browsertrix is currently in beta; though the system and backend API are fairly stable, we are working on many additional features.
Additional developer documentation is available at [https://docs.browsertrix.cloud/develop](https://docs.browsertrix.cloud/develop/) Please see the GitHub issues and [this GitHub Project](https://github.com/orgs/webrecorder/projects/9) for our current project plan and tasks. - ## License -Browsertrix Cloud is made available under the AGPLv3 License. +Browsertrix is made available under the AGPLv3 License. Documentation is made available under the Creative Commons Attribution 4.0 International License diff --git a/ansible/Pipfile.lock b/ansible/Pipfile.lock index e811d1d2e8..486f3aae20 100644 --- a/ansible/Pipfile.lock +++ b/ansible/Pipfile.lock @@ -140,72 +140,61 @@ }, "cffi": { "hashes": [ - "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5", - "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef", - "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104", - "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426", - "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405", - "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375", - "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a", - "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e", - "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc", - "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf", - "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185", - "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497", - "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3", - "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35", - "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c", - "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83", - "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21", - "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca", - "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984", - "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac", - "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd", - "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee", - "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a", - "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2", - "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192", - "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7", - "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585", - "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f", - "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e", - "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27", - "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b", - "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e", - "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e", - "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d", - "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c", - "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415", - "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82", - 
"sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02", - "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314", - "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325", - "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c", - "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3", - "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914", - "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045", - "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d", - "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9", - "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5", - "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2", - "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c", - "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3", - "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2", - "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8", - "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d", - "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d", - "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9", - "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162", - "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76", - "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4", - "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e", - "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9", - "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6", - "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b", - "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01", - "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0" - ], - "version": "==1.15.1" + "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc", + "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a", + "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417", + "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab", + "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520", + "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36", + "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743", + "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8", + "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed", + "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684", + "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56", + "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324", + "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d", + "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235", + "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e", + "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088", + "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000", + "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7", + 
"sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e", + "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673", + "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c", + "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe", + "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2", + "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098", + "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8", + "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a", + "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0", + "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b", + "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896", + "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e", + "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9", + "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2", + "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b", + "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6", + "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404", + "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f", + "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0", + "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4", + "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc", + "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936", + "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba", + "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872", + "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb", + "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614", + "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1", + "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d", + "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969", + "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b", + "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4", + "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627", + "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956", + "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357" + ], + "markers": "platform_python_implementation != 'PyPy'", + "version": "==1.16.0" }, "chardet": { "hashes": [ @@ -329,28 +318,42 @@ }, "cryptography": { "hashes": [ - "sha256:059e348f9a3c1950937e1b5d7ba1f8e968508ab181e75fc32b879452f08356db", - "sha256:1a5472d40c8f8e91ff7a3d8ac6dfa363d8e3138b961529c996f3e2df0c7a411a", - "sha256:1a8e6c2de6fbbcc5e14fd27fb24414507cb3333198ea9ab1258d916f00bc3039", - "sha256:1fee5aacc7367487b4e22484d3c7e547992ed726d14864ee33c0176ae43b0d7c", - "sha256:5d092fdfedaec4cbbffbf98cddc915ba145313a6fdaab83c6e67f4e6c218e6f3", - "sha256:5f0ff6e18d13a3de56f609dd1fd11470918f770c6bd5d00d632076c727d35485", - "sha256:7bfc55a5eae8b86a287747053140ba221afc65eb06207bedf6e019b8934b477c", - "sha256:7fa01527046ca5facdf973eef2535a27fec4cb651e4daec4d043ef63f6ecd4ca", - "sha256:8dde71c4169ec5ccc1087bb7521d54251c016f126f922ab2dfe6649170a3b8c5", - "sha256:8f4ab7021127a9b4323537300a2acfb450124b2def3756f64dc3a3d2160ee4b5", 
- "sha256:948224d76c4b6457349d47c0c98657557f429b4e93057cf5a2f71d603e2fc3a3", - "sha256:9a6c7a3c87d595608a39980ebaa04d5a37f94024c9f24eb7d10262b92f739ddb", - "sha256:b46e37db3cc267b4dea1f56da7346c9727e1209aa98487179ee8ebed09d21e43", - "sha256:b4ceb5324b998ce2003bc17d519080b4ec8d5b7b70794cbd2836101406a9be31", - "sha256:cb33ccf15e89f7ed89b235cff9d49e2e62c6c981a6061c9c8bb47ed7951190bc", - "sha256:d198820aba55660b4d74f7b5fd1f17db3aa5eb3e6893b0a41b75e84e4f9e0e4b", - "sha256:d34579085401d3f49762d2f7d6634d6b6c2ae1242202e860f4d26b046e3a1006", - "sha256:eb8163f5e549a22888c18b0d53d6bb62a20510060a22fd5a995ec8a05268df8a", - "sha256:f73bff05db2a3e5974a6fd248af2566134d8981fd7ab012e5dd4ddb1d9a70699" + "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b", + "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce", + "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88", + "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7", + "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20", + "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9", + "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff", + "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1", + "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764", + "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b", + "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298", + "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1", + "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824", + "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257", + "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a", + "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129", + "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb", + "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929", + "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854", + "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52", + "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923", + "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885", + "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0", + "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd", + "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2", + "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18", + "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b", + "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992", + "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74", + "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660", + "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925", + "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449" ], + "index": "pypi", "markers": "python_version >= '3.7'", - "version": "==41.0.1" + "version": "==42.0.4" }, "distro": { "hashes": [ diff --git a/assets/browsertrix-lockup-color-dynamic.svg b/assets/browsertrix-lockup-color-dynamic.svg new file mode 100644 index 0000000000..81b7e90425 --- /dev/null +++ b/assets/browsertrix-lockup-color-dynamic.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + 
diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index c1ebc9c0ea..b8cd420ea5 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -1,4 +1,5 @@ """k8s background jobs""" + import asyncio from datetime import datetime from typing import Optional, Tuple, Union, List, Dict, TYPE_CHECKING, cast @@ -402,11 +403,11 @@ async def get_replica_job_file( profile = await self.profile_ops.get_profile(UUID(job.object_id), org) return BaseFile(**profile.resource.dict()) - item_res = await self.base_crawl_ops.get_crawl_raw(job.object_id, org) - matching_file = [ - f for f in item_res.get("files", []) if f["filename"] == job.file_path - ][0] - return BaseFile(**matching_file) + item_res = await self.base_crawl_ops.get_base_crawl(job.object_id, org) + matching_file = [f for f in item_res.files if f.filename == job.file_path][ + 0 + ] + return matching_file # pylint: disable=broad-exception-caught, raise-missing-from except Exception: raise HTTPException(status_code=404, detail="file_not_found") diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index af4f1c6720..67e6d96d41 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -2,10 +2,9 @@ import os from datetime import timedelta -from typing import Optional, List, Union, Type, TYPE_CHECKING +from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast from uuid import UUID import urllib.parse -import contextlib import asyncio from fastapi import HTTPException, Depends @@ -30,17 +29,17 @@ if TYPE_CHECKING: from .crawlconfigs import CrawlConfigOps - from .crawlmanager import CrawlManager from .users import UserManager from .orgs import OrgOps from .colls import CollectionOps from .storages import StorageOps from .webhooks import EventWebhookOps from .background_jobs import BackgroundJobOps + from .pages import PageOps else: - CrawlConfigOps = UserManager = OrgOps = CollectionOps = object - CrawlManager = StorageOps = EventWebhookOps = BackgroundJobOps = object + CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object + StorageOps = EventWebhookOps = BackgroundJobOps = object # Presign duration must be less than 604800 seconds (one week), # so set this one minute short of a week. 
@@ -56,20 +55,21 @@ class BaseCrawlOps: # pylint: disable=duplicate-code, too-many-arguments, too-many-locals crawl_configs: CrawlConfigOps - crawl_manager: CrawlManager user_manager: UserManager orgs: OrgOps colls: CollectionOps storage_ops: StorageOps event_webhook_ops: EventWebhookOps background_job_ops: BackgroundJobOps + page_ops: PageOps + + presign_duration: int def __init__( self, mdb, users: UserManager, orgs: OrgOps, - crawl_manager: CrawlManager, crawl_configs: CrawlConfigOps, colls: CollectionOps, storage_ops: StorageOps, @@ -77,7 +77,6 @@ def __init__( background_job_ops: BackgroundJobOps, ): self.crawls = mdb["crawls"] - self.crawl_manager = crawl_manager self.crawl_configs = crawl_configs self.user_manager = users self.orgs = orgs @@ -85,6 +84,7 @@ def __init__( self.storage_ops = storage_ops self.event_webhook_ops = event_webhook_ops self.background_job_ops = background_job_ops + self.page_ops = cast(PageOps, None) presign_duration_minutes = int( os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT @@ -94,13 +94,17 @@ def __init__( min(presign_duration_minutes, PRESIGN_MINUTES_MAX) * 60 ) + def set_page_ops(self, page_ops): + """set page ops reference""" + self.page_ops = page_ops + async def get_crawl_raw( self, crawlid: str, org: Optional[Organization] = None, type_: Optional[str] = None, project: Optional[dict[str, bool]] = None, - ): + ) -> Dict[str, Any]: """Get data for single crawl""" query: dict[str, object] = {"_id": crawlid} @@ -117,40 +121,61 @@ async def get_crawl_raw( return res - async def _files_to_resources(self, files, org, crawlid): + async def _files_to_resources( + self, + files: List[Dict], + org: Organization, + crawlid: str, + qa_run_id: Optional[str] = None, + ) -> List[CrawlFileOut]: if not files: return [] crawl_files = [CrawlFile(**data) for data in files] - return await self.resolve_signed_urls(crawl_files, org, crawl_id=crawlid) + return await self._resolve_signed_urls(crawl_files, org, crawlid, qa_run_id) + + async def get_wacz_files(self, crawl_id: str, org: Organization): + """Return list of WACZ files associated with crawl.""" + wacz_files = [] + crawl = await self.get_base_crawl(crawl_id, org) + for file_ in crawl.files: + if file_.filename.endswith(".wacz"): + wacz_files.append(file_) + return wacz_files - async def get_crawl( + async def get_base_crawl( self, crawlid: str, org: Optional[Organization] = None, type_: Optional[str] = None, - cls_type: Type[Union[CrawlOut, CrawlOutWithResources]] = CrawlOutWithResources, - ): - """Get data for single base crawl""" - res = await self.get_crawl_raw(crawlid, org, type_) - - if cls_type == CrawlOutWithResources: - res["resources"] = await self._files_to_resources( - res.get("files"), org, crawlid - ) + project: Optional[dict[str, bool]] = None, + ) -> BaseCrawl: + """Get crawl data for internal use""" + res = await self.get_crawl_raw(crawlid, org, type_, project) + return BaseCrawl.from_dict(res) - if res.get("collectionIds"): - res["collections"] = await self.colls.get_collection_names( - res.get("collectionIds") - ) + async def get_crawl_out( + self, + crawlid: str, + org: Optional[Organization] = None, + type_: Optional[str] = None, + skip_resources=False, + ) -> CrawlOutWithResources: + """Get crawl data for api output""" + res = await self.get_crawl_raw(crawlid, org, type_) - res.pop("files", None) + files = res.pop("files", None) res.pop("errors", None) - crawl = cls_type.from_dict(res) + if not skip_resources: + coll_ids = res.get("collectionIds") + if coll_ids: + 
res["collections"] = await self.colls.get_collection_names(coll_ids) + + crawl = CrawlOutWithResources.from_dict(res) - if crawl.type == "crawl": - crawl = await self._resolve_crawl_refs(crawl, org) + if not skip_resources: + crawl = await self._resolve_crawl_refs(crawl, org, files) if crawl.config and crawl.config.seeds: crawl.config.seeds = None @@ -161,23 +186,22 @@ async def get_crawl( return crawl - async def get_resource_resolved_raw_crawl( - self, crawlid: str, org: Organization, type_=None - ): - """return single base crawl with resources resolved""" - res = await self.get_crawl_raw(crawlid=crawlid, type_=type_, org=org) - res["resources"] = await self._files_to_resources( - res.get("files"), org, res["_id"] - ) - return res + async def get_internal_crawl_out(self, crawl_id): + """add internal prefix for relative paths""" + crawl_out = await self.get_crawl_out(crawl_id) + resources = crawl_out.resources or [] + for file_ in resources: + file_.path = self.storage_ops.resolve_internal_access_path(file_.path) + + return crawl_out async def _update_crawl_collections( self, crawl_id: str, org: Organization, collection_ids: List[UUID] ): """Update crawl collections to match updated list.""" - crawl = await self.get_crawl(crawl_id, org, cls_type=CrawlOut) + crawl = await self.get_crawl_out(crawl_id, org, skip_resources=True) - prior_coll_ids = set(crawl.collectionIds) + prior_coll_ids = set(crawl.collectionIds or []) updated_coll_ids = set(collection_ids) # Add new collections @@ -257,50 +281,7 @@ async def add_crawl_file_replica( ) async def shutdown_crawl(self, crawl_id: str, org: Organization, graceful: bool): - """stop or cancel specified crawl""" - crawl = await self.get_crawl_raw(crawl_id, org) - if crawl.get("type") != "crawl": - return - - result = None - try: - result = await self.crawl_manager.shutdown_crawl( - crawl_id, graceful=graceful - ) - - if result.get("success"): - if graceful: - await self.crawls.find_one_and_update( - {"_id": crawl_id, "type": "crawl", "oid": org.id}, - {"$set": {"stopping": True}}, - ) - return result - - except Exception as exc: - # pylint: disable=raise-missing-from - # if reached here, probably crawl doesn't exist anymore - raise HTTPException( - status_code=404, detail=f"crawl_not_found, (details: {exc})" - ) - - # if job no longer running, canceling is considered success, - # but graceful stoppage is not possible, so would be a failure - if result.get("error") == "Not Found": - if not graceful: - await self.update_crawl_state(crawl_id, "canceled") - crawl = await self.get_crawl_raw(crawl_id, org) - if not await self.crawl_configs.stats_recompute_last( - crawl["cid"], 0, -1 - ): - raise HTTPException( - status_code=404, - detail=f"crawl_config_not_found: {crawl['cid']}", - ) - - return {"success": True} - - # return whatever detail may be included in the response - raise HTTPException(status_code=400, detail=result) + """placeholder, implemented in crawls, base version does nothing""" async def delete_crawls( self, @@ -308,24 +289,24 @@ async def delete_crawls( delete_list: DeleteCrawlList, type_: str, user: Optional[User] = None, - ): + ) -> tuple[int, dict[UUID, dict[str, int]], bool]: """Delete a list of crawls by id for given org""" - cids_to_update: dict[str, dict[str, int]] = {} + cids_to_update: dict[UUID, dict[str, int]] = {} size = 0 for crawl_id in delete_list.crawl_ids: - crawl = await self.get_crawl_raw(crawl_id, org) - if crawl.get("type") != type_: + crawl = await self.get_base_crawl(crawl_id, org) + if crawl.type != type_: continue 
# Ensure user has appropriate permissions for all crawls in list: # - Crawler users can delete their own crawls # - Org owners can delete any crawls in org - if user and (crawl.get("userid") != user.id) and not org.is_owner(user): + if user and (crawl.userid != user.id) and not org.is_owner(user): raise HTTPException(status_code=403, detail="not_allowed") - if type_ == "crawl" and not crawl.get("finished"): + if type_ == "crawl" and not crawl.finished: try: await self.shutdown_crawl(crawl_id, org, graceful=False) except Exception as exc: @@ -334,10 +315,13 @@ async def delete_crawls( status_code=400, detail=f"Error Stopping Crawl: {exc}" ) + if type_ == "crawl": + await self.page_ops.delete_crawl_pages(crawl_id, org.id) + crawl_size = await self._delete_crawl_files(crawl, org) size += crawl_size - cid = crawl.get("cid") + cid = crawl.cid if cid: if cids_to_update.get(cid): cids_to_update[cid]["inc"] += 1 @@ -367,9 +351,8 @@ async def delete_crawls( return res.deleted_count, cids_to_update, quota_reached - async def _delete_crawl_files(self, crawl, org: Organization): + async def _delete_crawl_files(self, crawl: BaseCrawl, org: Organization): """Delete files associated with crawl from storage.""" - crawl = BaseCrawl.from_dict(crawl) size = 0 for file_ in crawl.files: size += file_.size @@ -381,12 +364,18 @@ async def _delete_crawl_files(self, crawl, org: Organization): return size + async def delete_crawl_files(self, crawl_id: str, oid: UUID): + """Delete crawl files""" + crawl = await self.get_base_crawl(crawl_id) + org = await self.orgs.get_org_by_id(oid) + return await self._delete_crawl_files(crawl, org) + async def _resolve_crawl_refs( self, crawl: Union[CrawlOut, CrawlOutWithResources], org: Optional[Organization], + files: Optional[list[dict]], add_first_seed: bool = True, - files: Optional[list[dict]] = None, ): """Resolve running crawl data""" # pylint: disable=too-many-branches @@ -395,6 +384,12 @@ async def _resolve_crawl_refs( config = await self.crawl_configs.get_crawl_config( crawl.cid, org.id if org else None, active_only=False ) + + if not org: + org = await self.orgs.get_org_by_id(crawl.oid) + if not org: + raise HTTPException(status_code=400, detail="missing_org") + if config and config.config.seeds: if add_first_seed: first_seed = config.config.seeds[0] @@ -415,17 +410,18 @@ async def _resolve_crawl_refs( return crawl - async def resolve_signed_urls( + async def _resolve_signed_urls( self, files: List[CrawlFile], org: Organization, - update_presigned_url: bool = False, crawl_id: Optional[str] = None, - ): + qa_run_id: Optional[str] = None, + update_presigned_url: bool = False, + ) -> List[CrawlFileOut]: """Regenerate presigned URLs for files as necessary""" if not files: print("no files") - return + return [] delta = timedelta(seconds=self.presign_duration_seconds) @@ -440,12 +436,17 @@ async def resolve_signed_urls( presigned_url = await self.storage_ops.get_presigned_url( org, file_, self.presign_duration_seconds ) + + prefix = "files" + if qa_run_id: + prefix = f"qaFinished.{qa_run_id}.{prefix}" + await self.crawls.find_one_and_update( - {"files.filename": file_.filename}, + {f"{prefix}.filename": file_.filename}, { "$set": { - "files.$.presignedUrl": presigned_url, - "files.$.expireAt": exp, + f"{prefix}.$.presignedUrl": presigned_url, + f"{prefix}.$.expireAt": exp, } }, ) @@ -470,25 +471,13 @@ async def resolve_signed_urls( return out_files - @contextlib.asynccontextmanager - async def get_redis(self, crawl_id): - """get redis url for crawl id""" - redis_url = 
self.crawl_manager.get_redis_url(crawl_id) - - redis = await self.crawl_manager.get_redis_client(redis_url) - - try: - yield redis - finally: - await redis.close() - async def add_to_collection( self, crawl_ids: List[str], collection_id: UUID, org: Organization ): """Add crawls to collection.""" for crawl_id in crawl_ids: - crawl_raw = await self.get_crawl_raw(crawl_id, org) - crawl_collections = crawl_raw.get("collectionIds") + crawl = await self.get_base_crawl(crawl_id, org) + crawl_collections = crawl.collectionIds if crawl_collections and crawl_id in crawl_collections: raise HTTPException( status_code=400, detail="crawl_already_in_collection" @@ -638,11 +627,10 @@ async def delete_crawls_all_types( uploads: list[str] = [] for crawl_id in delete_list.crawl_ids: - crawl = await self.get_crawl_raw(crawl_id, org) - type_ = crawl.get("type") - if type_ == "crawl": + crawl = await self.get_base_crawl(crawl_id, org) + if crawl.type == "crawl": crawls.append(crawl_id) - if type_ == "upload": + if crawl.type == "upload": uploads.append(crawl_id) crawls_length = len(crawls) @@ -793,7 +781,7 @@ async def get_all_crawls_search_values( response_model=CrawlOutWithResources, ) async def get_base_crawl(crawl_id: str, org: Organization = Depends(org_crawl_dep)): - return await ops.get_crawl(crawl_id, org) + return await ops.get_crawl_out(crawl_id, org) @app.get( "/orgs/all/all-crawls/{crawl_id}/replay.json", @@ -804,15 +792,15 @@ async def get_base_crawl_admin(crawl_id, user: User = Depends(user_dep)): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await ops.get_crawl(crawl_id, None) + return await ops.get_crawl_out(crawl_id, None) @app.get( "/orgs/{oid}/all-crawls/{crawl_id}/replay.json", tags=["all-crawls"], response_model=CrawlOutWithResources, ) - async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): - return await ops.get_crawl(crawl_id, org) + async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)): + return await ops.get_crawl_out(crawl_id, org) @app.patch("/orgs/{oid}/all-crawls/{crawl_id}", tags=["all-crawls"]) async def update_crawl( diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index aa2a05358e..6a23fe50d2 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -1,6 +1,7 @@ """ Collections API """ + from collections import Counter from datetime import datetime from uuid import UUID, uuid4 @@ -19,6 +20,7 @@ CollIdName, UpdateColl, AddRemoveCrawlList, + BaseCrawl, CrawlOutWithResources, Organization, PaginatedResponse, @@ -329,17 +331,18 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): total_size = 0 tags = [] - async for crawl in self.crawls.find({"collectionIds": collection_id}): - if crawl["state"] not in SUCCESSFUL_STATES: + async for crawl_raw in self.crawls.find({"collectionIds": collection_id}): + crawl = BaseCrawl.from_dict(crawl_raw) + if crawl.state not in SUCCESSFUL_STATES: continue crawl_count += 1 - files = crawl.get("files", []) + files = crawl.files or [] for file in files: - total_size += file.get("size", 0) - if crawl.get("stats"): - page_count += crawl.get("stats", {}).get("done", 0) - if crawl.get("tags"): - tags.extend(crawl.get("tags")) + total_size += file.size + if crawl.stats: + page_count += crawl.stats.done + if crawl.tags: + tags.extend(crawl.tags) sorted_tags = [tag for tag, count in Counter(tags).most_common()] diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 
fdb8e77009..3fb1d09700 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -1,6 +1,7 @@ """ Crawl Config API handling """ + # pylint: disable=too-many-lines from typing import List, Union, Optional, Tuple, TYPE_CHECKING, cast @@ -23,6 +24,7 @@ CrawlConfig, CrawlConfigOut, CrawlConfigIdNameOut, + CrawlOut, EmptyStr, UpdateCrawlConfig, Organization, @@ -32,7 +34,7 @@ CrawlerChannel, CrawlerChannels, ) -from .utils import dt_now +from .utils import dt_now, slug_from_name if TYPE_CHECKING: from .orgs import OrgOps @@ -231,6 +233,7 @@ async def add_crawl_config( run_now=run_now, out_filename=out_filename, profile_filename=profile_filename or "", + warc_prefix=self.get_warc_prefix(org, crawlconfig), ) if crawl_id and run_now: @@ -297,6 +300,7 @@ async def readd_configmap( run_now=False, out_filename=self.default_filename_template, profile_filename=profile_filename or "", + warc_prefix=self.get_warc_prefix(org, crawlconfig), ) async def update_crawl_config( @@ -556,7 +560,9 @@ async def get_crawl_config_ids_for_profile( results = [CrawlConfigIdNameOut.from_dict(res) for res in results] return results - async def get_running_crawl(self, crawlconfig: CrawlConfig): + async def get_running_crawl( + self, crawlconfig: Union[CrawlConfig, CrawlConfigOut] + ) -> Optional[CrawlOut]: """Return the id of currently running crawl for this config, if any""" # crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id) crawls, _ = await self.crawl_ops.list_crawls( @@ -616,13 +622,15 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1): return result is not None - def _add_curr_crawl_stats(self, crawlconfig, crawl): + def _add_curr_crawl_stats( + self, crawlconfig: CrawlConfigOut, crawl: Optional[CrawlOut] + ): """Add stats from current running crawl, if any""" if not crawl: return crawlconfig.lastCrawlState = crawl.state - crawlconfig.lastCrawlSize = crawl.stats.get("size", 0) if crawl.stats else 0 + crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0 crawlconfig.lastCrawlStopping = crawl.stopping async def get_crawl_config_out(self, cid: UUID, org: Organization): @@ -811,8 +819,9 @@ async def get_crawl_config_search_values(self, org): "workflowIds": workflow_ids, } - async def run_now(self, cid: UUID, org: Organization, user: User): - """run specified crawlconfig now""" + async def prepare_for_run_crawl(self, cid: UUID, org: Organization) -> CrawlConfig: + """prepare for running a crawl, returning crawlconfig and + validating that running crawls is allowed""" crawlconfig = await self.get_crawl_config(cid, org.id) if not crawlconfig: @@ -820,11 +829,6 @@ async def run_now(self, cid: UUID, org: Organization, user: User): status_code=404, detail=f"Crawl Config '{cid}' not found" ) - if await self.get_running_crawl(crawlconfig): - raise HTTPException(status_code=400, detail="crawl_already_running") - - crawl_id = None - # ensure crawlconfig exists try: await self.crawl_manager.get_configmap(crawlconfig.id) @@ -838,9 +842,21 @@ async def run_now(self, cid: UUID, org: Organization, user: User): if await self.org_ops.exec_mins_quota_reached(org.id): raise HTTPException(status_code=403, detail="exec_minutes_quota_reached") + return crawlconfig + + async def run_now(self, cid: UUID, org: Organization, user: User): + """run specified crawlconfig now""" + crawlconfig = await self.prepare_for_run_crawl(cid, org) + + if await self.get_running_crawl(crawlconfig): + raise HTTPException(status_code=400, 
detail="crawl_already_running") + try: crawl_id = await self.crawl_manager.create_crawl_job( - crawlconfig, org.storage, userid=str(user.id) + crawlconfig, + org.storage, + userid=str(user.id), + warc_prefix=self.get_warc_prefix(org, crawlconfig), ) await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True) return crawl_id @@ -896,6 +912,21 @@ def get_channel_crawler_image( """Get crawler image name by id""" return self.crawler_images_map.get(crawler_channel or "") + def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str: + """Generate WARC prefix slug from org slug, name or url + if no name is provided, hostname is used from url, otherwise + url is ignored""" + name = crawlconfig.name + if not name: + if crawlconfig.config.seeds and len(crawlconfig.config.seeds): + url = crawlconfig.config.seeds[0].url + parts = urllib.parse.urlsplit(url) + name = parts.netloc + + name = slug_from_name(name or "") + prefix = org.slug + "-" + name + return prefix[:80] + # ============================================================================ # pylint: disable=too-many-locals diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index fe455e8341..536da58274 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -96,9 +96,11 @@ async def run_replica_job( "replica_secret_name": replica_storage.get_storage_secret_name(oid), "replica_file_path": replica_file_path, "replica_endpoint": replica_endpoint, - "primary_secret_name": primary_storage.get_storage_secret_name(oid) - if primary_storage - else None, + "primary_secret_name": ( + primary_storage.get_storage_secret_name(oid) + if primary_storage + else None + ), "primary_file_path": primary_file_path if primary_file_path else None, "primary_endpoint": primary_endpoint if primary_endpoint else None, "BgJobType": BgJobType, @@ -117,6 +119,7 @@ async def add_crawl_config( run_now: bool, out_filename: str, profile_filename: str, + warc_prefix: str, ) -> Optional[str]: """add new crawl, store crawl config in configmap""" @@ -137,7 +140,10 @@ async def add_crawl_config( if run_now: crawl_id = await self.create_crawl_job( - crawlconfig, storage, str(crawlconfig.modifiedBy) + crawlconfig, + storage, + str(crawlconfig.modifiedBy), + warc_prefix, ) await self._update_scheduled_job(crawlconfig) @@ -149,6 +155,7 @@ async def create_crawl_job( crawlconfig: CrawlConfig, storage: StorageRef, userid: str, + warc_prefix: str, ) -> str: """create new crawl job from config""" cid = str(crawlconfig.id) @@ -167,6 +174,38 @@ async def create_crawl_job( crawlconfig.crawlTimeout, crawlconfig.maxCrawlSize, manual=True, + warc_prefix=warc_prefix, + ) + + async def create_qa_crawl_job( + self, + crawlconfig: CrawlConfig, + storage: StorageRef, + userid: str, + qa_source: str, + ) -> str: + """create new QA Run crawl job with qa source crawl id""" + cid = str(crawlconfig.id) + + storage_secret = storage.get_storage_secret_name(str(crawlconfig.oid)) + + await self.has_storage_secret(storage_secret) + + ts_now = dt_now().strftime("%Y%m%d%H%M%S") + crawl_id = f"qa-{ts_now}-{cid[:12]}" + + return await self.new_crawl_job( + cid, + userid, + crawlconfig.oid, + storage, + crawlconfig.crawlerChannel, + 1, + 0, + 0, + warc_prefix="qa", + crawl_id=crawl_id, + qa_source=qa_source, ) async def update_crawl_config( diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 2d3788e97c..4711bec709 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ 
-1,12 +1,15 @@ """ Crawl API """ + # pylint: disable=too-many-lines import json import re +import contextlib import urllib.parse +from datetime import datetime from uuid import UUID -from typing import Optional, List, Dict, Union +from typing import Optional, List, Dict, Union, Any from fastapi import Depends, HTTPException from fastapi.responses import StreamingResponse @@ -16,19 +19,27 @@ from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .utils import dt_now, parse_jsonl_error_messages, stream_dict_list_as_csv from .basecrawls import BaseCrawlOps +from .crawlmanager import CrawlManager from .models import ( UpdateCrawl, DeleteCrawlList, CrawlConfig, UpdateCrawlConfig, CrawlScale, + CrawlStats, + CrawlFile, Crawl, CrawlOut, CrawlOutWithResources, + QARun, + QARunOut, + QARunWithResources, + DeleteQARunList, Organization, User, PaginatedResponse, RUNNING_AND_STARTING_STATES, + SUCCESSFUL_STATES, ALL_CRAWL_STATES, ) @@ -38,13 +49,15 @@ # ============================================================================ +# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods class CrawlOps(BaseCrawlOps): """Crawl Ops""" - # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods + crawl_manager: CrawlManager - def __init__(self, *args): + def __init__(self, crawl_manager: CrawlManager, *args): super().__init__(*args) + self.crawl_manager = crawl_manager self.crawl_configs.set_crawl_ops(self) self.colls.set_crawl_ops(self) self.event_webhook_ops.set_crawl_ops(self) @@ -75,6 +88,28 @@ async def init_index(self): await self.crawls.create_index([("state", pymongo.HASHED)]) await self.crawls.create_index([("fileSize", pymongo.DESCENDING)]) + async def get_crawl( + self, + crawlid: str, + org: Optional[Organization] = None, + project: Optional[dict[str, bool]] = None, + ) -> Crawl: + """Get crawl data for internal use""" + res = await self.get_crawl_raw(crawlid, org, "crawl", project) + return Crawl.from_dict(res) + + @contextlib.asynccontextmanager + async def get_redis(self, crawl_id): + """get redis url for crawl id""" + redis_url = self.crawl_manager.get_redis_url(crawl_id) + + redis = await self.crawl_manager.get_redis_client(redis_url) + + try: + yield redis + finally: + await redis.close() + async def list_crawls( self, org: Optional[Organization] = None, @@ -192,7 +227,7 @@ async def list_crawls( crawl = cls.from_dict(result) files = result.get("files") if resources else None crawl = await self._resolve_crawl_refs( - crawl, org, add_first_seed=False, files=files + crawl, org, files=files, add_first_seed=False ) crawls.append(crawl) @@ -221,16 +256,6 @@ async def delete_crawls( return {"deleted": True, "storageQuotaReached": quota_reached} - async def get_wacz_files(self, crawl_id: str, org: Organization): - """Return list of WACZ files associated with crawl.""" - wacz_files = [] - crawl_raw = await self.get_crawl_raw(crawl_id, org) - crawl = Crawl.from_dict(crawl_raw) - for file_ in crawl.files: - if file_.filename.endswith(".wacz"): - wacz_files.append(file_) - return wacz_files - # pylint: disable=too-many-arguments async def add_new_crawl( self, @@ -277,16 +302,15 @@ async def add_new_crawl( return dt_now except pymongo.errors.DuplicateKeyError: - # print(f"Crawl Already Added: {crawl.id} - {crawl.state}") return None async def update_crawl_scale( self, crawl_id: str, org: Organization, crawl_scale: CrawlScale, user: User ): """Update crawl scale in the db""" - crawl = await self.get_crawl_raw(crawl_id, org) 
+ crawl = await self.get_crawl(crawl_id, org) update = UpdateCrawlConfig(scale=crawl_scale.scale) - await self.crawl_configs.update_crawl_config(crawl["cid"], org, user, update) + await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update) result = await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl", "oid": org.id}, @@ -383,35 +407,15 @@ async def match_crawl_queue(self, crawl_id, regex, offset=0): return {"total": total, "matched": matched, "nextOffset": next_offset} - async def get_errors_from_redis( - self, crawl_id: str, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1 - ): - """Get crawl errors from Redis and optionally store in mongodb.""" - # Zero-index page for query - page = page - 1 - skip = page * page_size - upper_bound = skip + page_size - 1 - - async with self.get_redis(crawl_id) as redis: - try: - errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound) - total = await redis.llen(f"{crawl_id}:e") - except exceptions.ConnectionError: - # pylint: disable=raise-missing-from - raise HTTPException(status_code=503, detail="error_logs_not_available") - - parsed_errors = parse_jsonl_error_messages(errors) - return parsed_errors, total - async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add): """add new exclusion to config or remove exclusion from config for given crawl_id, update config on crawl""" - crawl_raw = await self.get_crawl_raw(crawl_id, org, project={"cid": True}) + crawl = await self.get_crawl(crawl_id, org, project={"cid": True}) - cid = crawl_raw.get("cid") + cid = crawl.cid - scale = crawl_raw.get("scale", 1) + scale = crawl.scale or 1 async with self.get_redis(crawl_id) as redis: query = { @@ -435,50 +439,111 @@ async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add): return {"success": True} async def update_crawl_state_if_allowed( - self, crawl_id, state, allowed_from, **kwargs + self, + crawl_id: str, + is_qa: bool, + state: str, + allowed_from: List[str], + finished: Optional[datetime] = None, + stats: Optional[CrawlStats] = None, ): """update crawl state and other properties in db if state has changed""" - kwargs["state"] = state - query = {"_id": crawl_id, "type": "crawl"} + prefix = "" if not is_qa else "qa." + + update: Dict[str, Any] = {f"{prefix}state": state} + if finished: + update[f"{prefix}finished"] = finished + if stats: + update[f"{prefix}stats"] = stats.dict() + + query: Dict[str, Any] = {"_id": crawl_id, "type": "crawl"} if allowed_from: - query["state"] = {"$in": allowed_from} + query[f"{prefix}state"] = {"$in": allowed_from} - return await self.crawls.find_one_and_update(query, {"$set": kwargs}) + return await self.crawls.find_one_and_update(query, {"$set": update}) - async def update_running_crawl_stats(self, crawl_id, stats): + async def update_running_crawl_stats( + self, crawl_id: str, is_qa: bool, stats: CrawlStats + ): """update running crawl stats""" - query = {"_id": crawl_id, "type": "crawl", "state": "running"} - return await self.crawls.find_one_and_update(query, {"$set": {"stats": stats}}) + prefix = "" if not is_qa else "qa." 
+ query = {"_id": crawl_id, "type": "crawl", f"{prefix}state": "running"} + return await self.crawls.find_one_and_update( + query, {"$set": {f"{prefix}stats": stats.dict()}} + ) - async def inc_crawl_exec_time(self, crawl_id, exec_time): + async def inc_crawl_exec_time( + self, + crawl_id: str, + is_qa: bool, + exec_time, + last_updated_time, + ): """increment exec time""" + # update both crawl-shared qa exec seconds and per-qa run exec seconds + if is_qa: + inc_update = { + "qaCrawlExecSeconds": exec_time, + "qa.crawlExecSeconds": exec_time, + } + else: + inc_update = {"crawlExecSeconds": exec_time} + return await self.crawls.find_one_and_update( - {"_id": crawl_id, "type": "crawl"}, - {"$inc": {"crawlExecSeconds": exec_time}}, + { + "_id": crawl_id, + "type": "crawl", + "_lut": {"$ne": last_updated_time}, + }, + { + "$inc": inc_update, + "$set": {"_lut": last_updated_time}, + }, ) - async def get_crawl_state(self, crawl_id): + async def get_crawl_exec_last_update_time(self, crawl_id): + """get crawl last updated time""" + res = await self.crawls.find_one( + {"_id": crawl_id, "type": "crawl"}, projection=["_lut"] + ) + return res and res.get("_lut") + + async def get_crawl_state(self, crawl_id: str, is_qa: bool): """return current crawl state of a crawl""" + prefix = "" if not is_qa else "qa." + res = await self.crawls.find_one( - {"_id": crawl_id}, projection=["state", "finished"] + {"_id": crawl_id}, + projection={"state": f"${prefix}state", "finished": f"${prefix}finished"}, ) if not res: return None, None return res.get("state"), res.get("finished") - async def add_crawl_errors(self, crawl_id, errors): - """add crawl errors from redis to mongodb errors field""" + async def add_crawl_error( + self, + crawl_id: str, + is_qa: bool, + error: str, + ): + """add crawl error from redis to mongodb errors field""" + prefix = "" if not is_qa else "qa." + await self.crawls.find_one_and_update( - {"_id": crawl_id}, {"$push": {"errors": {"$each": errors}}} + {"_id": crawl_id}, {"$push": {f"{prefix}errors": error}} ) - async def add_crawl_file(self, crawl_id, crawl_file, size): + async def add_crawl_file( + self, crawl_id: str, is_qa: bool, crawl_file: CrawlFile, size: int + ): """add new crawl file to crawl""" + prefix = "" if not is_qa else "qa." 
+ await self.crawls.find_one_and_update( {"_id": crawl_id}, { - "$push": {"files": crawl_file.dict()}, - "$inc": {"fileCount": 1, "fileSize": size}, + "$push": {f"{prefix}files": crawl_file.dict()}, + "$inc": {f"{prefix}fileCount": 1, f"{prefix}fileSize": size}, }, ) @@ -493,9 +558,10 @@ async def get_crawl_seeds( skip = (page - 1) * page_size upper_bound = skip + page_size - crawl_raw = await self.get_crawl_raw(crawl_id, org) + crawl = await self.get_crawl(crawl_id, org) + if not crawl.config or not crawl.config.seeds: + return [], 0 try: - crawl = Crawl.from_dict(crawl_raw) return crawl.config.seeds[skip:upper_bound], len(crawl.config.seeds) # pylint: disable=broad-exception-caught except Exception: @@ -515,60 +581,261 @@ async def get_crawl_stats( if org: query["oid"] = org.id - async for crawl in self.crawls.find(query): + async for crawl_raw in self.crawls.find(query): + crawl = Crawl.from_dict(crawl_raw) data: Dict[str, Union[str, int]] = {} - data["id"] = str(crawl.get("_id")) + data["id"] = crawl.id - oid = crawl.get("oid") - data["oid"] = str(oid) - data["org"] = org_slugs[oid] + data["oid"] = str(crawl.oid) + data["org"] = org_slugs[crawl.oid] - data["cid"] = str(crawl.get("cid")) - crawl_name = crawl.get("name") - data["name"] = f'"{crawl_name}"' if crawl_name else "" - data["state"] = crawl.get("state") + data["cid"] = crawl.id + data["name"] = f'"{crawl.name}"' if crawl.name else "" + data["state"] = crawl.state - userid = crawl.get("userid") - data["userid"] = str(userid) - data["user"] = user_emails.get(userid) + data["userid"] = str(crawl.userid) + data["user"] = user_emails.get(crawl.userid) - started = crawl.get("started") - finished = crawl.get("finished") - - data["started"] = str(started) - data["finished"] = str(finished) + data["started"] = str(crawl.started) + data["finished"] = str(crawl.finished) data["duration"] = 0 - if started and finished: - duration = finished - started + duration_seconds = 0 + if crawl.started and crawl.finished: + duration = crawl.finished - crawl.started duration_seconds = int(duration.total_seconds()) if duration_seconds: data["duration"] = duration_seconds - done_stats = None - if crawl.get("stats") and crawl.get("stats").get("done"): - done_stats = crawl["stats"]["done"] - - data["pages"] = 0 - if done_stats: - data["pages"] = done_stats + if crawl.stats: + data["pages"] = crawl.stats.done - data["filesize"] = crawl.get("fileSize", 0) + data["filesize"] = crawl.fileSize data["avg_page_time"] = 0 - if ( - done_stats - and done_stats != 0 - and started - and finished - and duration_seconds - ): - data["avg_page_time"] = int(duration_seconds / done_stats) + if crawl.stats and crawl.stats.done != 0 and duration_seconds: + data["avg_page_time"] = int(duration_seconds / crawl.stats.done) crawls_data.append(data) return crawls_data + async def shutdown_crawl( + self, crawl_id: str, org: Organization, graceful: bool + ) -> Dict[str, bool]: + """stop or cancel specified crawl""" + crawl = await self.get_base_crawl(crawl_id, org) + if crawl and crawl.type != "crawl": + raise HTTPException(status_code=400, detail="not_a_crawl") + + result = None + try: + result = await self.crawl_manager.shutdown_crawl( + crawl_id, graceful=graceful + ) + + if result.get("success"): + if graceful: + await self.crawls.find_one_and_update( + {"_id": crawl_id, "type": "crawl", "oid": org.id}, + {"$set": {"stopping": True}}, + ) + return result + + except Exception as exc: + # pylint: disable=raise-missing-from + # if reached here, probably crawl doesn't exist anymore 
+ raise HTTPException( + status_code=404, detail=f"crawl_not_found, (details: {exc})" + ) + + # if job no longer running, canceling is considered success, + # but graceful stoppage is not possible, so would be a failure + if result.get("error") == "Not Found": + if not graceful: + await self.update_crawl_state(crawl_id, "canceled") + crawl = await self.get_crawl(crawl_id, org) + if not await self.crawl_configs.stats_recompute_last(crawl.cid, 0, -1): + raise HTTPException( + status_code=404, + detail=f"crawl_config_not_found: {crawl.cid}", + ) + + return {"success": True} + + # return whatever detail may be included in the response + raise HTTPException(status_code=400, detail=result) + + async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User): + """Start crawl QA run""" + + crawl = await self.get_crawl(crawl_id, org) + + # can only QA finished crawls + if not crawl.finished: + raise HTTPException(status_code=400, detail="crawl_not_finished") + + # can only QA successfully finished crawls + if crawl.state not in SUCCESSFUL_STATES: + raise HTTPException(status_code=400, detail="crawl_did_not_succeed") + + # can only run one QA at a time + if crawl.qa: + raise HTTPException(status_code=400, detail="qa_already_running") + + # not a valid crawl + if not crawl.cid or crawl.type != "crawl": + raise HTTPException(status_code=400, detail="invalid_crawl_for_qa") + + crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org) + + try: + qa_run_id = await self.crawl_manager.create_qa_crawl_job( + crawlconfig, + org.storage, + userid=str(user.id), + qa_source=crawl_id, + ) + + image = self.crawl_configs.get_channel_crawler_image( + crawlconfig.crawlerChannel + ) + + qa_run = QARun( + id=qa_run_id, + started=datetime.now(), + userid=user.id, + userName=user.name, + state="starting", + image=image, + ) + + await self.crawls.find_one_and_update( + {"_id": crawl_id}, + { + "$set": { + "qa": qa_run.dict(), + } + }, + ) + + return qa_run_id + + except Exception as exc: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}") + + async def stop_crawl_qa_run(self, crawl_id: str, org: Organization): + """Stop crawl QA run, QA run removed when actually finished""" + crawl = await self.get_crawl(crawl_id, org) + + if not crawl.qa: + raise HTTPException(status_code=400, detail="qa_not_running") + + try: + result = await self.crawl_manager.shutdown_crawl(crawl.qa.id, graceful=True) + + if result.get("error") == "Not Found": + # treat as success, qa crawl no longer exists, so mark as no qa + result = {"success": True} + + return result + + except Exception as exc: + # pylint: disable=raise-missing-from + # if reached here, probably crawl doesn't exist anymore + raise HTTPException( + status_code=404, detail=f"crawl_not_found, (details: {exc})" + ) + + async def delete_crawl_qa_runs(self, crawl_id: str, delete_list: DeleteQARunList): + """delete specified finished QA run""" + + count = 0 + for qa_run_id in delete_list.qa_run_ids: + res = await self.crawls.find_one_and_update( + {"_id": crawl_id, "type": "crawl"}, + {"$unset": {f"qaFinished.{qa_run_id}": ""}}, + ) + + if res: + count += 1 + + await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id) + + return {"deleted": count} + + async def qa_run_finished(self, crawl_id: str): + """clear active qa, add qa run to finished list, if successful""" + crawl = await self.get_crawl(crawl_id) + + if not crawl.qa: + return False + + query: Dict[str, Any] = {"qa": 
None} + + if crawl.qa.finished and crawl.qa.state in SUCCESSFUL_STATES: + query[f"qaFinished.{crawl.qa.id}"] = crawl.qa.dict() + + if await self.crawls.find_one_and_update( + {"_id": crawl_id, "type": "crawl"}, {"$set": query} + ): + return True + + return False + + async def get_qa_runs( + self, crawl_id: str, org: Optional[Organization] = None + ) -> List[QARunOut]: + """Return list of QA runs""" + crawl_data = await self.get_crawl_raw( + crawl_id, org, "crawl", project={"qaFinished": True, "qa": True} + ) + qa_finished = crawl_data.get("qaFinished") or {} + all_qa = [QARunOut(**qa_run_data) for qa_run_data in qa_finished.values()] + all_qa.sort(key=lambda x: x.finished or dt_now(), reverse=True) + qa = crawl_data.get("qa") + if qa: + all_qa.insert(0, QARunOut(**qa)) + return all_qa + + async def get_active_qa( + self, crawl_id: str, org: Optional[Organization] = None + ) -> Optional[QARunOut]: + """return just the active QA, if any""" + crawl_data = await self.get_crawl_raw( + crawl_id, org, "crawl", project={"qa": True} + ) + qa = crawl_data.get("qa") + return QARunOut(**qa) if qa else None + + async def get_qa_run_for_replay( + self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None + ) -> QARunWithResources: + """Fetch QA runs with resources for replay.json""" + crawl = await self.get_crawl(crawl_id, org) + qa_finished = crawl.qaFinished or {} + qa_run = qa_finished.get(qa_run_id) + + if not qa_run: + raise HTTPException(status_code=404, detail="crawl_qa_not_found") + + if not org: + org = await self.orgs.get_org_by_id(crawl.oid) + if not org: + raise HTTPException(status_code=400, detail="missing_org") + + resources = await self._resolve_signed_urls( + qa_run.files, org, crawl.id, qa_run_id + ) + + qa_run.files = [] + + qa_run_dict = qa_run.dict() + qa_run_dict["resources"] = resources + + return QARunWithResources(**qa_run_dict) + # ============================================================================ async def recompute_crawl_file_count_and_size(crawls, crawl_id): @@ -590,11 +857,11 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id): # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, too-many-statements -def init_crawls_api(app, user_dep, *args): +def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args): """API for crawl management, including crawl done callback""" # pylint: disable=invalid-name, duplicate-code - ops = CrawlOps(*args) + ops = CrawlOps(crawl_manager, *args) org_viewer_dep = ops.orgs.org_viewer_dep org_crawl_dep = ops.orgs.org_crawl_dep @@ -744,15 +1011,81 @@ async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await ops.get_crawl(crawl_id, None, "crawl") + return await ops.get_crawl_out(crawl_id, None, "crawl") @app.get( "/orgs/{oid}/crawls/{crawl_id}/replay.json", tags=["crawls"], response_model=CrawlOutWithResources, ) - async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): - return await ops.get_crawl(crawl_id, org, "crawl") + async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)): + return await ops.get_crawl_out(crawl_id, org, "crawl") + + # QA APIs + # --------------------- + @app.get( + "/orgs/all/crawls/{crawl_id}/qa/{qa_run_id}/replay.json", + tags=["qa"], + response_model=QARunWithResources, + ) + async def get_qa_run_admin(crawl_id, qa_run_id, user: User = 
Depends(user_dep)): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + + return await ops.get_qa_run_for_replay(crawl_id, qa_run_id) + + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/replay.json", + tags=["qa"], + response_model=QARunWithResources, + ) + async def get_qa_run( + crawl_id, qa_run_id, org: Organization = Depends(org_viewer_dep) + ): + return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org) + + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["qa"]) + async def start_crawl_qa_run( + crawl_id: str, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), + ): + qa_run_id = await ops.start_crawl_qa_run(crawl_id, org, user) + return {"started": qa_run_id} + + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["qa"]) + async def stop_crawl_qa_run( + crawl_id: str, org: Organization = Depends(org_crawl_dep) + ): + # pylint: disable=unused-argument + return await ops.stop_crawl_qa_run(crawl_id, org) + + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/delete", tags=["qa"]) + async def delete_crawl_qa_runs( + crawl_id: str, + qa_run_ids: DeleteQARunList, + org: Organization = Depends(org_crawl_dep), + ): + # pylint: disable=unused-argument + return await ops.delete_crawl_qa_runs(crawl_id, qa_run_ids) + + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/qa", + tags=["qa"], + response_model=List[QARunOut], + ) + async def get_qa_runs(crawl_id, org: Organization = Depends(org_viewer_dep)): + return await ops.get_qa_runs(crawl_id, org) + + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/qa/activeQA", + tags=["qa"], + response_model=Dict[str, Optional[QARunOut]], + ) + async def get_active_qa(crawl_id, org: Organization = Depends(org_viewer_dep)): + return {"qa": await ops.get_active_qa(crawl_id, org)} + + # ---- @app.get( "/orgs/all/crawls/{crawl_id}", @@ -889,7 +1222,7 @@ async def stream_crawl_logs( logLevel: Optional[str] = None, context: Optional[str] = None, ): - crawl = await ops.get_crawl(crawl_id, org, "crawl") + crawl = await ops.get_crawl_out(crawl_id, org) log_levels = [] contexts = [] @@ -898,11 +1231,10 @@ async def stream_crawl_logs( if context: contexts = context.split(",") - # If crawl is finished, stream logs from WACZ files + # If crawl is finished, stream logs from WACZ files using presigned urls if crawl.finished: - wacz_files = await ops.get_wacz_files(crawl_id, org) resp = await ops.storage_ops.sync_stream_wacz_logs( - org, wacz_files, log_levels, contexts + crawl.resources or [], log_levels, contexts ) return StreamingResponse( resp, @@ -924,18 +1256,13 @@ async def get_crawl_errors( page: int = 1, org: Organization = Depends(org_viewer_dep), ): - crawl_raw = await ops.get_crawl_raw(crawl_id, org) - crawl = Crawl.from_dict(crawl_raw) + crawl = await ops.get_crawl(crawl_id, org) - if crawl.finished: - skip = (page - 1) * pageSize - upper_bound = skip + pageSize - errors = crawl.errors[skip:upper_bound] - parsed_errors = parse_jsonl_error_messages(errors) - total = len(crawl.errors) - return paginated_format(parsed_errors, total, page, pageSize) - - errors, total = await ops.get_errors_from_redis(crawl_id, pageSize, page) - return paginated_format(errors, total, page, pageSize) + skip = (page - 1) * pageSize + upper_bound = skip + pageSize + + errors = crawl.errors[skip:upper_bound] if crawl.errors else [] + parsed_errors = parse_jsonl_error_messages(errors) + return paginated_format(parsed_errors, len(crawl.errors or []), page, pageSize) return ops diff --git 
a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index d91c6d2a80..93207ea12a 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -1,13 +1,14 @@ """ Browsertrix API Mongo DB initialization """ + import importlib.util import os import urllib import asyncio from uuid import UUID -from typing import Optional, Union +from typing import Optional, Union, TypeVar, Type import motor.motor_asyncio from pydantic import BaseModel @@ -79,6 +80,7 @@ async def update_and_prepare_db( coll_ops, invite_ops, storage_ops, + page_ops, db_inited, ): """Prepare database for application. @@ -91,10 +93,16 @@ async def update_and_prepare_db( """ await ping_db(mdb) print("Database setup started", flush=True) - if await run_db_migrations(mdb, user_manager): + if await run_db_migrations(mdb, user_manager, page_ops): await drop_indexes(mdb) await create_indexes( - org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager + org_ops, + crawl_ops, + crawl_config_ops, + coll_ops, + invite_ops, + user_manager, + page_ops, ) await user_manager.create_super_user() await org_ops.create_default_org() @@ -104,7 +112,7 @@ async def update_and_prepare_db( # ============================================================================ -async def run_db_migrations(mdb, user_manager): +async def run_db_migrations(mdb, user_manager, page_ops): """Run database migrations.""" # if first run, just set version and exit @@ -136,7 +144,7 @@ async def run_db_migrations(mdb, user_manager): assert spec.loader migration_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(migration_module) - migration = migration_module.Migration(mdb) + migration = migration_module.Migration(mdb, page_ops=page_ops) if await migration.run(): migrations_run = True except ImportError as err: @@ -184,7 +192,7 @@ async def drop_indexes(mdb): # ============================================================================ # pylint: disable=too-many-arguments async def create_indexes( - org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager + org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager, page_ops ): """Create database indexes.""" print("Creating database indexes", flush=True) @@ -194,6 +202,11 @@ async def create_indexes( await coll_ops.init_index() await invite_ops.init_index() await user_manager.init_index() + await page_ops.init_index() + + +# ============================================================================ +T = TypeVar("T") # ============================================================================ @@ -208,10 +221,10 @@ def id_str(self): return str(self.id) @classmethod - def from_dict(cls, data): + def from_dict(cls: Type[T], data: dict) -> T: """convert dict from mongo to a class""" if not data: - return None + return cls() data["id"] = data.pop("_id") return cls(**data) diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index c2d70f4458..578c22692e 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -1,4 +1,5 @@ """ K8S API Access """ + import os import traceback @@ -29,7 +30,9 @@ def __init__(self): self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers" self.custom_resources = {} - self.templates = Jinja2Templates(directory=get_templates_dir()) + self.templates = Jinja2Templates( + directory=get_templates_dir(), autoescape=False + ) config.load_incluster_config() self.client = client @@ -66,7 +69,10 @@ def get_redis_url(self, crawl_id): async def get_redis_client(self, redis_url): 
"""return redis client with correct params for one-time use""" return aioredis.from_url( - redis_url, decode_responses=True, auto_close_connection_pool=True + redis_url, + decode_responses=True, + auto_close_connection_pool=True, + socket_timeout=20, ) # pylint: disable=too-many-arguments, too-many-locals @@ -82,6 +88,8 @@ def new_crawl_job_yaml( max_crawl_size=0, manual=True, crawl_id=None, + warc_prefix="", + qa_source="", ): """load job template from yaml""" if not crawl_id: @@ -100,6 +108,8 @@ def new_crawl_job_yaml( "storage_name": str(storage), "manual": "1" if manual else "0", "crawler_channel": crawler_channel, + "warc_prefix": warc_prefix, + "qa_source": qa_source, } data = self.templates.env.get_template("crawl_job.yaml").render(params) @@ -167,12 +177,14 @@ async def has_storage_secret(self, storage_secret) -> bool: async def delete_crawl_job(self, crawl_id): """delete custom crawljob object""" try: + name = f"crawljob-{crawl_id}" + await self.custom_api.delete_namespaced_custom_object( group="btrix.cloud", version="v1", namespace=self.namespace, plural="crawljobs", - name=f"crawljob-{crawl_id}", + name=name, grace_period_seconds=0, # delete as background to allow operator to do proper cleanup propagation_policy="Background", @@ -210,20 +222,17 @@ async def get_profile_browser(self, browserid): ) async def _patch_job(self, crawl_id, body, pluraltype="crawljobs") -> dict: - content_type = self.api_client.default_headers.get("Content-Type") - try: - self.api_client.set_default_header( - "Content-Type", "application/merge-patch+json" - ) + name = f"{pluraltype[:-1]}-{crawl_id}" await self.custom_api.patch_namespaced_custom_object( group="btrix.cloud", version="v1", namespace=self.namespace, plural=pluraltype, - name=f"{pluraltype[:-1]}-{crawl_id}", + name=name, body={"spec": body}, + _content_type="application/merge-patch+json", ) return {"success": True} # pylint: disable=broad-except @@ -231,12 +240,6 @@ async def _patch_job(self, crawl_id, body, pluraltype="crawljobs") -> dict: traceback.print_exc() return {"error": str(exc)} - finally: - if content_type: - self.api_client.set_default_header("Content-Type", content_type) - else: - del self.api_client.default_headers["Content-Type"] - async def print_pod_logs(self, pod_names, lines=100): """print pod logs""" for pod in pod_names: diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 2908d6b42f..4378a8701b 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -2,6 +2,7 @@ main file for browsertrix-api system supports docker and kubernetes based deployments of multiple browsertrix-crawlers """ + import os import asyncio import sys @@ -28,6 +29,7 @@ from .basecrawls import init_base_crawls_api from .webhooks import init_event_webhooks_api from .background_jobs import init_background_jobs_api +from .pages import init_pages_api from .crawlmanager import CrawlManager from .utils import run_once_lock, register_exit_handler, is_bool @@ -65,6 +67,7 @@ def main(): os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120) ), "maxPagesPerCrawl": int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)), + "maxScale": int(os.environ.get("MAX_CRAWL_SCALE", 3)), } invites = init_invites(mdb, email) @@ -131,10 +134,10 @@ def main(): base_crawl_init = ( app, current_active_user, + # to basecrawls mdb, user_manager, org_ops, - crawl_manager, crawl_config_ops, coll_ops, storage_ops, @@ -144,7 +147,14 @@ def main(): base_crawl_ops = init_base_crawls_api(*base_crawl_init) - crawls = init_crawls_api(*base_crawl_init) + 
crawls = init_crawls_api(crawl_manager, *base_crawl_init) + + page_ops = init_pages_api( + app, mdb, crawls, org_ops, storage_ops, current_active_user + ) + + base_crawl_ops.set_page_ops(page_ops) + crawls.set_page_ops(page_ops) init_uploads_api(*base_crawl_init) @@ -168,6 +178,7 @@ def main(): coll_ops, invites, storage_ops, + page_ops, db_inited, ) ) @@ -188,12 +199,20 @@ async def get_settings(): async def openapi() -> JSONResponse: return JSONResponse(app_root.openapi()) - @app_root.get("/healthz", include_in_schema=False) - async def healthz(): + # Used for startup + # Returns 200 only when db is available + migrations are done + @app_root.get("/healthzStartup", include_in_schema=False) + async def healthz_startup(): if not db_inited.get("inited"): raise HTTPException(status_code=503, detail="not_ready_yet") return {} + # Used for readiness + liveness + # Always returns 200 while running + @app_root.get("/healthz", include_in_schema=False) + async def healthz(): + return {} + app_root.include_router(app, prefix=API_PREFIX) diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py index 751c17527e..2b2119821e 100644 --- a/backend/btrixcloud/main_op.py +++ b/backend/btrixcloud/main_op.py @@ -21,6 +21,7 @@ from .storages import init_storages_api from .webhooks import EventWebhookOps from .background_jobs import BackgroundJobOps +from .pages import PageOps app_root = FastAPI() @@ -76,10 +77,10 @@ def main(): coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops) crawl_ops = CrawlOps( + crawl_manager, mdb, user_manager, org_ops, - crawl_manager, crawl_config_ops, coll_ops, storage_ops, @@ -87,6 +88,10 @@ def main(): background_job_ops, ) + page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) + + crawl_ops.set_page_ops(page_ops) + background_job_ops.set_ops(crawl_ops, profile_ops) return init_operator_api( @@ -98,6 +103,7 @@ def main(): storage_ops, event_webhook_ops, background_job_ops, + page_ops, ) @@ -106,5 +112,5 @@ def main(): async def startup(): """init on startup""" register_exit_handler() - oper = main() - await oper.async_init() + settings = main() + await settings.async_init() diff --git a/backend/btrixcloud/migrations/__init__.py b/backend/btrixcloud/migrations/__init__.py index 7488769f6c..e983072b23 100644 --- a/backend/btrixcloud/migrations/__init__.py +++ b/backend/btrixcloud/migrations/__init__.py @@ -1,6 +1,7 @@ """ BaseMigration class to subclass in each migration module """ + import os from pymongo.errors import OperationFailure diff --git a/backend/btrixcloud/migrations/migration_0001_archives_to_orgs.py b/backend/btrixcloud/migrations/migration_0001_archives_to_orgs.py index 79b7139b99..49742d2fe7 100644 --- a/backend/btrixcloud/migrations/migration_0001_archives_to_orgs.py +++ b/backend/btrixcloud/migrations/migration_0001_archives_to_orgs.py @@ -1,6 +1,7 @@ """ Migration 0001 - Archives to Orgs """ + import os from pymongo.errors import OperationFailure @@ -23,8 +24,9 @@ class Migration(BaseMigration): "profiles", ] - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up.""" diff --git a/backend/btrixcloud/migrations/migration_0002_crawlconfig_crawlstats.py b/backend/btrixcloud/migrations/migration_0002_crawlconfig_crawlstats.py index 3f24b6fab6..ad148c93e0 100644 --- 
a/backend/btrixcloud/migrations/migration_0002_crawlconfig_crawlstats.py +++ b/backend/btrixcloud/migrations/migration_0002_crawlconfig_crawlstats.py @@ -1,6 +1,7 @@ """ Migration 0002 - Dropping CrawlConfig crawl stats """ + from btrixcloud.migrations import BaseMigration @@ -10,8 +11,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py b/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py index f773316c9a..c0427bc582 100644 --- a/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py +++ b/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py @@ -1,6 +1,7 @@ """ Migration 0003 - Mutable crawl configs and crawl revision history """ + from datetime import datetime from btrixcloud.models import Crawl, CrawlConfig @@ -13,8 +14,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0004_config_seeds.py b/backend/btrixcloud/migrations/migration_0004_config_seeds.py index a4e9f66e09..8f6e5ff6c2 100644 --- a/backend/btrixcloud/migrations/migration_0004_config_seeds.py +++ b/backend/btrixcloud/migrations/migration_0004_config_seeds.py @@ -1,6 +1,7 @@ """ Migration 0004 - Ensuring all config.seeds are Seeds not HttpUrls """ + from pydantic import HttpUrl from btrixcloud.models import Crawl, CrawlConfig, ScopeType, Seed @@ -13,8 +14,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. 
@@ -100,12 +102,14 @@ async def migrate_up(self): # Test migration async for config_dict in crawl_configs.find({}): config = CrawlConfig.from_dict(config_dict) - for seed in config.config.seeds: + seeds = config.config.seeds or [] + for seed in seeds: assert isinstance(seed, Seed) assert seed.url async for crawl_dict in crawls.find({}): crawl = Crawl.from_dict(crawl_dict) - for seed in crawl.config.seeds: + seeds = crawl.config.seeds or [] + for seed in seeds: assert isinstance(seed, Seed) assert seed.url diff --git a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py index 7134f63c04..ab225e54e7 100644 --- a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py +++ b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py @@ -1,6 +1,7 @@ """ Migration 0005 - Updating scheduled cron jobs after Operator changes """ + from btrixcloud.models import CrawlConfig, UpdateCrawlConfig from btrixcloud.crawlmanager import CrawlManager from btrixcloud.migrations import BaseMigration @@ -12,8 +13,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py b/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py index f920bd64a3..3af7ebddca 100644 --- a/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py +++ b/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py @@ -1,6 +1,7 @@ """ Migration 0006 - Precomputing workflow crawl stats """ + from btrixcloud.crawlconfigs import stats_recompute_all from btrixcloud.migrations import BaseMigration @@ -11,8 +12,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. 
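The migrations above all switch from a fixed migration_version keyword argument to a catch-all **kwargs constructor, matching the loader change in db.py earlier in this changeset, where each migration is now instantiated as Migration(mdb, page_ops=page_ops). The sketch below illustrates that contract only; it is not part of the changeset, ExampleMigration and its version string are placeholders, and a migration that actually needs an injected dependency would simply keep a reference to it from kwargs.

# Illustrative sketch, not part of this changeset. ExampleMigration and
# MIGRATION_VERSION are placeholders; BaseMigration and the constructor
# contract come from the migration modules shown above.
from btrixcloud.migrations import BaseMigration

MIGRATION_VERSION = "9999"


class ExampleMigration(BaseMigration):
    """Example migration written against the new constructor contract."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)
        # A migration that needs an injected dependency keeps a reference,
        # for example: self.page_ops = kwargs.get("page_ops")
        # Unused kwargs are ignored, so older migrations stay compatible
        # as the runner grows new injected dependencies.

    async def migrate_up(self):
        """Perform migration up (no-op in this sketch)."""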
diff --git a/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py b/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py index ec5d2e94dd..f5b0efe0d0 100644 --- a/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py +++ b/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py @@ -4,6 +4,7 @@ - Rename colls to autoAddCollections - Re-calculate workflow crawl stats to populate crawlSuccessfulCount """ + from btrixcloud.crawlconfigs import stats_recompute_all from btrixcloud.migrations import BaseMigration @@ -14,8 +15,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up.""" diff --git a/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py b/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py index 1bc8db2ac4..63856e543e 100644 --- a/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py +++ b/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py @@ -1,6 +1,7 @@ """ Migration 0008 - Precomputing crawl file stats """ + from btrixcloud.crawls import recompute_crawl_file_count_and_size from btrixcloud.migrations import BaseMigration @@ -11,8 +12,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0009_crawl_types.py b/backend/btrixcloud/migrations/migration_0009_crawl_types.py index 54c6f490d7..08e5bc60a5 100644 --- a/backend/btrixcloud/migrations/migration_0009_crawl_types.py +++ b/backend/btrixcloud/migrations/migration_0009_crawl_types.py @@ -1,6 +1,7 @@ """ Migration 0009 - Crawl types """ + from btrixcloud.migrations import BaseMigration @@ -10,8 +11,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0010_collection_total_size.py b/backend/btrixcloud/migrations/migration_0010_collection_total_size.py index e2a2eb3d2b..8e6234954a 100644 --- a/backend/btrixcloud/migrations/migration_0010_collection_total_size.py +++ b/backend/btrixcloud/migrations/migration_0010_collection_total_size.py @@ -1,6 +1,7 @@ """ Migration 0010 - Precomputing collection total size """ + from btrixcloud.colls import CollectionOps from btrixcloud.migrations import BaseMigration @@ -11,8 +12,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. 
diff --git a/backend/btrixcloud/migrations/migration_0011_crawl_timeout_configmap.py b/backend/btrixcloud/migrations/migration_0011_crawl_timeout_configmap.py index 3a9b2abd81..6793e90f38 100644 --- a/backend/btrixcloud/migrations/migration_0011_crawl_timeout_configmap.py +++ b/backend/btrixcloud/migrations/migration_0011_crawl_timeout_configmap.py @@ -1,6 +1,7 @@ """ Migration 0011 - Remove None CRAWL_TIMEOUT values from configmaps """ + import os from btrixcloud.k8sapi import K8sAPI @@ -14,8 +15,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0012_notes_to_description.py b/backend/btrixcloud/migrations/migration_0012_notes_to_description.py index b6fb6d70b8..7e33763421 100644 --- a/backend/btrixcloud/migrations/migration_0012_notes_to_description.py +++ b/backend/btrixcloud/migrations/migration_0012_notes_to_description.py @@ -1,6 +1,7 @@ """ Migration 0012 - Notes to description """ + from btrixcloud.migrations import BaseMigration @@ -10,8 +11,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0013_crawl_name.py b/backend/btrixcloud/migrations/migration_0013_crawl_name.py index 3c120f88d0..6bf57fefa7 100644 --- a/backend/btrixcloud/migrations/migration_0013_crawl_name.py +++ b/backend/btrixcloud/migrations/migration_0013_crawl_name.py @@ -1,6 +1,7 @@ """ Migration 0013 - Copy config name to crawls """ + from btrixcloud.migrations import BaseMigration @@ -10,8 +11,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py b/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py index 8243136c33..ac18163a02 100644 --- a/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py +++ b/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py @@ -1,6 +1,7 @@ """ Migration 0014 - collections to collectionIDs """ + from btrixcloud.migrations import BaseMigration @@ -10,8 +11,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. 
diff --git a/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py b/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py index c1582ee406..dd006966d0 100644 --- a/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py +++ b/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py @@ -1,6 +1,7 @@ """ Migration 0015 - Calculate and store org storage usage """ + from btrixcloud.migrations import BaseMigration @@ -11,8 +12,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py b/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py index b1cd648ed5..0e8ff0a449 100644 --- a/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py +++ b/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py @@ -1,6 +1,7 @@ """ Migration 0016 - Updating scheduled cron jobs after Operator changes v2 """ + import os from btrixcloud.models import CrawlConfig, UpdateCrawlConfig from btrixcloud.crawlmanager import CrawlManager @@ -13,8 +14,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0017_storage_by_type.py b/backend/btrixcloud/migrations/migration_0017_storage_by_type.py index ae2b6e6fd0..2549ec6f4c 100644 --- a/backend/btrixcloud/migrations/migration_0017_storage_by_type.py +++ b/backend/btrixcloud/migrations/migration_0017_storage_by_type.py @@ -1,6 +1,7 @@ """ Migration 0017 - Calculate and store org storage usage by type """ + from btrixcloud.migrations import BaseMigration @@ -11,8 +12,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0018_usernames.py b/backend/btrixcloud/migrations/migration_0018_usernames.py index 9b6afe794a..5129a02853 100644 --- a/backend/btrixcloud/migrations/migration_0018_usernames.py +++ b/backend/btrixcloud/migrations/migration_0018_usernames.py @@ -1,6 +1,7 @@ """ Migration 0018 - Store crawl and workflow userName directly in db """ + from btrixcloud.migrations import BaseMigration from btrixcloud.emailsender import EmailSender @@ -15,8 +16,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. 
diff --git a/backend/btrixcloud/migrations/migration_0019_org_slug.py b/backend/btrixcloud/migrations/migration_0019_org_slug.py index defe4dfcad..ec3e08cd88 100644 --- a/backend/btrixcloud/migrations/migration_0019_org_slug.py +++ b/backend/btrixcloud/migrations/migration_0019_org_slug.py @@ -1,6 +1,7 @@ """ Migration 0019 - Organization slug """ + from btrixcloud.migrations import BaseMigration from btrixcloud.utils import slug_from_name @@ -11,8 +12,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0020_org_storage_refs.py b/backend/btrixcloud/migrations/migration_0020_org_storage_refs.py index 291d79d633..9704b31bc9 100644 --- a/backend/btrixcloud/migrations/migration_0020_org_storage_refs.py +++ b/backend/btrixcloud/migrations/migration_0020_org_storage_refs.py @@ -1,6 +1,7 @@ """ Migration 0020 - New Storage Ref System """ + from btrixcloud.migrations import BaseMigration @@ -10,8 +11,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0021_profile_filenames.py b/backend/btrixcloud/migrations/migration_0021_profile_filenames.py index 1f9cd4c71e..56be52ab75 100644 --- a/backend/btrixcloud/migrations/migration_0021_profile_filenames.py +++ b/backend/btrixcloud/migrations/migration_0021_profile_filenames.py @@ -1,6 +1,7 @@ """ Migration 0021 - Profile filenames """ + from btrixcloud.crawlmanager import CrawlManager from btrixcloud.migrations import BaseMigration from btrixcloud.models import CrawlConfig, Profile, UpdateCrawlConfig @@ -13,8 +14,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0022_partial_complete.py b/backend/btrixcloud/migrations/migration_0022_partial_complete.py index 7b90bbb525..88c2190d4b 100644 --- a/backend/btrixcloud/migrations/migration_0022_partial_complete.py +++ b/backend/btrixcloud/migrations/migration_0022_partial_complete.py @@ -1,6 +1,7 @@ """ Migration 0022 -- Partial Complete """ + from btrixcloud.migrations import BaseMigration @@ -10,8 +11,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. 
diff --git a/backend/btrixcloud/migrations/migration_0023_available_extra_exec_mins.py b/backend/btrixcloud/migrations/migration_0023_available_extra_exec_mins.py index b0ac3d98f0..fa50f35247 100644 --- a/backend/btrixcloud/migrations/migration_0023_available_extra_exec_mins.py +++ b/backend/btrixcloud/migrations/migration_0023_available_extra_exec_mins.py @@ -1,6 +1,7 @@ """ Migration 0023 -- Available extra/gifted minutes """ + from btrixcloud.migrations import BaseMigration @@ -10,8 +11,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0024_crawlerchannel.py b/backend/btrixcloud/migrations/migration_0024_crawlerchannel.py index 6ffa4214c1..afbb1ae7af 100644 --- a/backend/btrixcloud/migrations/migration_0024_crawlerchannel.py +++ b/backend/btrixcloud/migrations/migration_0024_crawlerchannel.py @@ -1,6 +1,7 @@ """ Migration 0024 -- crawlerChannel """ + from btrixcloud.crawlmanager import CrawlManager from btrixcloud.migrations import BaseMigration from btrixcloud.models import CrawlConfig, UpdateCrawlConfig @@ -12,8 +13,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. diff --git a/backend/btrixcloud/migrations/migration_0025_workflow_db_configmap_fixes.py b/backend/btrixcloud/migrations/migration_0025_workflow_db_configmap_fixes.py index 6c159e69b5..170318efbf 100644 --- a/backend/btrixcloud/migrations/migration_0025_workflow_db_configmap_fixes.py +++ b/backend/btrixcloud/migrations/migration_0025_workflow_db_configmap_fixes.py @@ -1,6 +1,7 @@ """ Migration 0025 -- fix workflow database and configmap issues. """ + from btrixcloud.crawlmanager import CrawlManager from btrixcloud.migrations import BaseMigration from btrixcloud.models import CrawlConfig, UpdateCrawlConfig @@ -12,8 +13,9 @@ class Migration(BaseMigration): """Migration class.""" - def __init__(self, mdb, migration_version=MIGRATION_VERSION): - super().__init__(mdb, migration_version) + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) async def migrate_up(self): """Perform migration up. 
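On the runner side, the db.py hunk earlier in this changeset shows run_db_migrations() receiving page_ops and passing it to every Migration it loads. The condensed sketch below restates that flow for clarity; it is illustrative only, omits the version bookkeeping and error handling, and assumes the caller supplies the list of migration module paths, since the discovery logic is not visible in the hunk.

# Condensed, illustrative restatement of the loader flow from db.py above;
# not the actual implementation. Version checks and error handling omitted,
# and migration_paths is assumed to be provided by the caller.
import importlib.util


async def run_migrations_sketch(mdb, page_ops, migration_paths):
    """Load each migration module, instantiate it with shared kwargs, run it."""
    migrations_run = False
    for path in migration_paths:  # e.g. paths to migration_00xx_*.py files
        spec = importlib.util.spec_from_file_location("migration", path)
        assert spec and spec.loader
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # Every migration accepts **kwargs, so new dependencies such as
        # page_ops can be threaded through without touching old migrations.
        migration = module.Migration(mdb, page_ops=page_ops)
        if await migration.run():
            migrations_run = True

    return migrations_run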
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index c14340e3e3..3621032831 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -387,6 +387,8 @@ def get_raw_config(self): class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional): """Crawl Config Output""" + id: UUID + lastCrawlStopping: Optional[bool] = False profileName: Optional[str] firstSeed: Optional[str] @@ -523,36 +525,70 @@ class CrawlFileOut(BaseModel): # ============================================================================ -class BaseCrawl(BaseMongoModel): - """Base Crawl object (representing crawls, uploads and manual sessions)""" +class ReviewStatus(str, Enum): + """QA review statuses""" - id: str + GOOD = "good" + ACCEPTABLE = "acceptable" + FAILURE = "failure" + + +# ============================================================================ +class CrawlStats(BaseModel): + """Crawl Stats for pages and size""" + + found: int = 0 + done: int = 0 + size: int = 0 - type: str + +# ============================================================================ +class CoreCrawlable(BaseModel): + # pylint: disable=too-few-public-methods + """Core properties for crawlable run (crawl or qa run)""" + + id: str userid: UUID userName: Optional[str] - oid: UUID started: datetime finished: Optional[datetime] = None - name: Optional[str] = "" - state: str - stats: Optional[Dict[str, int]] = None + crawlExecSeconds: int = 0 - files: Optional[List[CrawlFile]] = [] + image: Optional[str] - description: Optional[str] = "" + stats: Optional[CrawlStats] = CrawlStats() + + files: List[CrawlFile] = [] + + fileSize: int = 0 + fileCount: int = 0 errors: Optional[List[str]] = [] + +# ============================================================================ +class BaseCrawl(CoreCrawlable, BaseMongoModel): + """Base Crawl object (representing crawls, uploads and manual sessions)""" + + type: str + + oid: UUID + cid: Optional[UUID] = None + + name: Optional[str] = "" + + description: Optional[str] = "" + + tags: Optional[List[str]] = [] + collectionIds: Optional[List[UUID]] = [] - fileSize: int = 0 - fileCount: int = 0 + reviewStatus: Optional[ReviewStatus] = None # ============================================================================ @@ -587,7 +623,7 @@ class CrawlOut(BaseMongoModel): state: str - stats: Optional[Dict[str, int]] + stats: Optional[CrawlStats] fileSize: int = 0 fileCount: int = 0 @@ -599,6 +635,7 @@ class CrawlOut(BaseMongoModel): collectionIds: Optional[List[UUID]] = [] crawlExecSeconds: int = 0 + qaCrawlExecSeconds: int = 0 # automated crawl fields config: Optional[RawCrawlConfig] @@ -617,6 +654,8 @@ class CrawlOut(BaseMongoModel): crawlerChannel: str = "default" image: Optional[str] + reviewStatus: Optional[ReviewStatus] = None + # ============================================================================ class CrawlOutWithResources(CrawlOut): @@ -634,6 +673,7 @@ class UpdateCrawl(BaseModel): description: Optional[str] tags: Optional[List[str]] collectionIds: Optional[List[UUID]] + reviewStatus: Optional[ReviewStatus] # ============================================================================ @@ -643,6 +683,13 @@ class DeleteCrawlList(BaseModel): crawl_ids: List[str] +# ============================================================================ +class DeleteQARunList(BaseModel): + """delete qa run list POST body""" + + qa_run_ids: List[str] + + # ============================================================================ ### AUTOMATED CRAWLS ### @@ -655,6 
+702,36 @@ class CrawlScale(BaseModel): scale: conint(ge=1, le=MAX_CRAWL_SCALE) = 1 # type: ignore +# ============================================================================ +class QARun(CoreCrawlable, BaseModel): + """Subdocument to track QA runs for given crawl""" + + +# ============================================================================ +class QARunWithResources(QARun): + """QA crawl output model including resources""" + + resources: Optional[List[CrawlFileOut]] = [] + + +# ============================================================================ +class QARunOut(BaseModel): + """QA Run Output""" + + id: str + + userName: Optional[str] + + started: datetime + finished: Optional[datetime] = None + + state: str + + crawlExecSeconds: int = 0 + + stats: CrawlStats = CrawlStats() + + # ============================================================================ class Crawl(BaseCrawl, CrawlConfigCore): """Store State of a Crawl (Finished or Running)""" @@ -672,9 +749,10 @@ class Crawl(BaseCrawl, CrawlConfigCore): stopping: Optional[bool] = False - crawlExecSeconds: int = 0 + qaCrawlExecSeconds: int = 0 - image: Optional[str] + qa: Optional[QARun] = None + qaFinished: Optional[Dict[str, QARun]] = {} # ============================================================================ @@ -704,8 +782,6 @@ class UploadedCrawl(BaseCrawl): type: Literal["upload"] = "upload" - tags: Optional[List[str]] = [] - # ============================================================================ class UpdateUpload(UpdateCrawl): @@ -896,8 +972,15 @@ class OrgOut(BaseMongoModel): storageQuotaReached: Optional[bool] execMinutesQuotaReached: Optional[bool] + # total usage and exec time usage: Optional[Dict[str, int]] crawlExecSeconds: Dict[str, int] = {} + + # qa only usage + exec time + qaUsage: Optional[Dict[str, int]] = {} + qaCrawlExecSeconds: Dict[str, int] = {} + + # exec time limits monthlyExecSeconds: Dict[str, int] = {} extraExecSeconds: Dict[str, int] = {} giftedExecSeconds: Dict[str, int] = {} @@ -931,8 +1014,15 @@ class Organization(BaseMongoModel): bytesStoredUploads: int = 0 bytesStoredProfiles: int = 0 + # total usage + exec time usage: Dict[str, int] = {} crawlExecSeconds: Dict[str, int] = {} + + # qa only usage + exec time + qaUsage: Dict[str, int] = {} + qaCrawlExecSeconds: Dict[str, int] = {} + + # exec time limits monthlyExecSeconds: Dict[str, int] = {} extraExecSeconds: Dict[str, int] = {} giftedExecSeconds: Dict[str, int] = {} @@ -1263,27 +1353,27 @@ class BaseCollectionItemBody(WebhookNotificationBody): class CollectionItemAddedBody(BaseCollectionItemBody): """Webhook notification POST body for collection additions""" - event: Literal[ + event: Literal[WebhookEventType.ADDED_TO_COLLECTION] = ( WebhookEventType.ADDED_TO_COLLECTION - ] = WebhookEventType.ADDED_TO_COLLECTION + ) # ============================================================================ class CollectionItemRemovedBody(BaseCollectionItemBody): """Webhook notification POST body for collection removals""" - event: Literal[ + event: Literal[WebhookEventType.REMOVED_FROM_COLLECTION] = ( WebhookEventType.REMOVED_FROM_COLLECTION - ] = WebhookEventType.REMOVED_FROM_COLLECTION + ) # ============================================================================ class CollectionDeletedBody(WebhookNotificationBody): """Webhook notification base POST body for collection changes""" - event: Literal[ + event: Literal[WebhookEventType.COLLECTION_DELETED] = ( WebhookEventType.COLLECTION_DELETED - ] = 
WebhookEventType.COLLECTION_DELETED + ) collectionId: str @@ -1414,3 +1504,110 @@ class AnyJob(BaseModel): """Union of all job types, for response model""" __root__: Union[CreateReplicaJob, DeleteReplicaJob, BackgroundJob] + + +# ============================================================================ + +### PAGES ### + + +# ============================================================================ +class PageReviewUpdate(BaseModel): + """Update model for page manual review/approval""" + + approved: Optional[bool] = None + + +# ============================================================================ +class PageNoteIn(BaseModel): + """Input model for adding page notes""" + + text: str + + +# ============================================================================ +class PageNoteEdit(BaseModel): + """Input model for editing page notes""" + + id: UUID + text: str + + +# ============================================================================ +class PageNoteDelete(BaseModel): + """Delete model for page notes""" + + delete_list: List[UUID] = [] + + +# ============================================================================ +class PageNote(BaseModel): + """Model for page notes, tracking user and time""" + + id: UUID + text: str + created: datetime = datetime.now() + userid: UUID + userName: str + + +# ============================================================================ +class PageQACompare(BaseModel): + """Model for updating pages from QA run""" + + screenshotMatch: Optional[float] = None + textMatch: Optional[float] = None + resourceCounts: Optional[Dict[str, int]] + + +# ============================================================================ +class Page(BaseMongoModel): + """Core page data, no QA""" + + id: UUID + + oid: UUID + crawl_id: str + + # core page data + url: AnyHttpUrl + title: Optional[str] = None + ts: Optional[datetime] = None + loadState: Optional[int] = None + status: Optional[int] = None + + # manual review + userid: Optional[UUID] = None + modified: Optional[datetime] = None + approved: Optional[bool] = None + notes: List[PageNote] = [] + + +# ============================================================================ +class PageWithAllQA(Page): + """Model for core page data + qa""" + + # automated heuristics, keyed by QA run id + qa: Optional[Dict[str, PageQACompare]] = {} + + +# ============================================================================ +class PageOut(Page): + """Model for pages output, no QA""" + + status: Optional[int] = 200 + + +# ============================================================================ +class PageOutWithSingleQA(Page): + """Page out with single QA entry""" + + qa: Optional[PageQACompare] = None + + +# ============================================================================ +class PagesAndResources(BaseModel): + """moage for qa configmap data, pages + resources""" + + resources: List[CrawlFileOut] = [] + pages: List[PageOut] = [] diff --git a/backend/btrixcloud/operator/__init__.py b/backend/btrixcloud/operator/__init__.py new file mode 100644 index 0000000000..dd5f4830da --- /dev/null +++ b/backend/btrixcloud/operator/__init__.py @@ -0,0 +1,28 @@ +""" operators module """ + +from .profiles import ProfileOperator +from .bgjobs import BgJobOperator +from .cronjobs import CronJobOperator +from .crawls import CrawlOperator +from .baseoperator import K8sOpAPI + +operator_classes = [ProfileOperator, BgJobOperator, CronJobOperator, CrawlOperator] + + +# 
============================================================================ +def init_operator_api(app, *args): + """registers webhook handlers for metacontroller""" + + k8s = K8sOpAPI() + + operators = [] + for cls in operator_classes: + oper = cls(k8s, *args) + oper.init_routes(app) + operators.append(oper) + + @app.get("/healthz", include_in_schema=False) + async def healthz(): + return {} + + return k8s diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py new file mode 100644 index 0000000000..b06d8bf051 --- /dev/null +++ b/backend/btrixcloud/operator/baseoperator.py @@ -0,0 +1,150 @@ +""" Base Operator class for all operators """ + +import asyncio +from typing import TYPE_CHECKING +from kubernetes.utils import parse_quantity + +import yaml +from btrixcloud.k8sapi import K8sAPI + + +if TYPE_CHECKING: + from btrixcloud.crawlconfigs import CrawlConfigOps + from btrixcloud.crawls import CrawlOps + from btrixcloud.orgs import OrgOps + from btrixcloud.colls import CollectionOps + from btrixcloud.storages import StorageOps + from btrixcloud.webhooks import EventWebhookOps + from btrixcloud.users import UserManager + from btrixcloud.background_jobs import BackgroundJobOps + from btrixcloud.pages import PageOps + from redis.asyncio.client import Redis +else: + CrawlConfigOps = CrawlOps = OrgOps = CollectionOps = Redis = object + StorageOps = EventWebhookOps = UserManager = BackgroundJobOps = PageOps = object + + +# ============================================================================ +class K8sOpAPI(K8sAPI): + """Additional k8s api for operators""" + + def __init__(self): + super().__init__() + self.config_file = "/config/config.yaml" + with open(self.config_file, encoding="utf-8") as fh_config: + self.shared_params = yaml.safe_load(fh_config) + + self.has_pod_metrics = False + self.compute_crawler_resources() + self.compute_profile_resources() + + def compute_crawler_resources(self): + """compute memory / cpu resources for crawlers""" + p = self.shared_params + num = max(int(p["crawler_browser_instances"]) - 1, 0) + print("crawler resources") + if not p.get("crawler_cpu"): + base = parse_quantity(p["crawler_cpu_base"]) + extra = parse_quantity(p["crawler_extra_cpu_per_browser"]) + + # cpu is a floating value of cpu cores + p["crawler_cpu"] = float(base + num * extra) + + print(f"cpu = {base} + {num} * {extra} = {p['crawler_cpu']}") + else: + print(f"cpu = {p['crawler_cpu']}") + + if not p.get("crawler_memory"): + base = parse_quantity(p["crawler_memory_base"]) + extra = parse_quantity(p["crawler_extra_memory_per_browser"]) + + # memory is always an int + p["crawler_memory"] = int(base + num * extra) + + print(f"memory = {base} + {num} * {extra} = {p['crawler_memory']}") + else: + print(f"memory = {p['crawler_memory']}") + + def compute_profile_resources(self): + """compute memory /cpu resources for a single profile browser""" + p = self.shared_params + # if no profile specific options provided, default to crawler base for one browser + profile_cpu = parse_quantity( + p.get("profile_browser_cpu") or p["crawler_cpu_base"] + ) + profile_memory = parse_quantity( + p.get("profile_browser_memory") or p["crawler_memory_base"] + ) + p["profile_cpu"] = profile_cpu + p["profile_memory"] = profile_memory + + print("profile browser resources") + print(f"cpu = {profile_cpu}") + print(f"memory = {profile_memory}") + + async def async_init(self): + """perform any async init here""" + self.has_pod_metrics = await self.is_pod_metrics_available() + 
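# ----------------------------------------------------------------------------
# Illustrative sketch (editor's example, not part of the upstream change): how
# the base + per-extra-browser formula in compute_crawler_resources() above
# works out. The chart values below are assumptions chosen for the example.
from kubernetes.utils import parse_quantity

example_params = {
    "crawler_browser_instances": 4,
    "crawler_cpu_base": "900m",
    "crawler_extra_cpu_per_browser": "600m",
    "crawler_memory_base": "1Gi",
    "crawler_extra_memory_per_browser": "768Mi",
}
extra_browsers = max(int(example_params["crawler_browser_instances"]) - 1, 0)

# cpu = 0.9 + 3 * 0.6 = 2.7 cores (kept as a float of cores)
crawler_cpu = float(
    parse_quantity(example_params["crawler_cpu_base"])
    + extra_browsers * parse_quantity(example_params["crawler_extra_cpu_per_browser"])
)

# memory = 1Gi + 3 * 768Mi = 3489660928 bytes (kept as an int of bytes)
crawler_memory = int(
    parse_quantity(example_params["crawler_memory_base"])
    + extra_browsers
    * parse_quantity(example_params["crawler_extra_memory_per_browser"])
)
# ----------------------------------------------------------------------------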
print("Pod Metrics Available:", self.has_pod_metrics) + + +# pylint: disable=too-many-instance-attributes, too-many-arguments +# ============================================================================ +class BaseOperator: + """BaseOperator""" + + k8s: K8sOpAPI + crawl_config_ops: CrawlConfigOps + crawl_ops: CrawlOps + orgs_ops: OrgOps + coll_ops: CollectionOps + storage_ops: StorageOps + event_webhook_ops: EventWebhookOps + background_job_ops: BackgroundJobOps + user_ops: UserManager + page_ops: PageOps + + def __init__( + self, + k8s, + crawl_config_ops, + crawl_ops, + org_ops, + coll_ops, + storage_ops, + event_webhook_ops, + background_job_ops, + page_ops, + ): + self.k8s = k8s + self.crawl_config_ops = crawl_config_ops + self.crawl_ops = crawl_ops + self.org_ops = org_ops + self.coll_ops = coll_ops + self.storage_ops = storage_ops + self.background_job_ops = background_job_ops + self.event_webhook_ops = event_webhook_ops + self.page_ops = page_ops + + self.user_ops = crawl_config_ops.user_manager + + # to avoid background tasks being garbage collected + # see: https://stackoverflow.com/a/74059981 + self.bg_tasks = set() + + def init_routes(self, app): + """init routes for this operator""" + + def run_task(self, func): + """add bg tasks to set to avoid premature garbage collection""" + task = asyncio.create_task(func) + self.bg_tasks.add(task) + task.add_done_callback(self.bg_tasks.discard) + + def load_from_yaml(self, filename, params): + """load and parse k8s template from yaml file""" + return list( + yaml.safe_load_all( + self.k8s.templates.env.get_template(filename).render(params) + ) + ) diff --git a/backend/btrixcloud/operator/bgjobs.py b/backend/btrixcloud/operator/bgjobs.py new file mode 100644 index 0000000000..fad3deea41 --- /dev/null +++ b/backend/btrixcloud/operator/bgjobs.py @@ -0,0 +1,62 @@ +""" Operator handler for BackgroundJobs """ + +from uuid import UUID +import traceback + +from btrixcloud.utils import ( + from_k8s_date, + dt_now, +) + +from .models import MCDecoratorSyncData +from .baseoperator import BaseOperator + + +# ============================================================================ +class BgJobOperator(BaseOperator): + """BgJobOperator""" + + def init_routes(self, app): + """init routes for this operator""" + + # nop, but needed for metacontroller + @app.post("/op/backgroundjob/sync") + async def mc_sync_background_jobs(): + return {"attachments": []} + + @app.post("/op/backgroundjob/finalize") + async def mc_finalize_background_jobs(data: MCDecoratorSyncData): + return await self.finalize_background_job(data) + + async def finalize_background_job(self, data: MCDecoratorSyncData) -> dict: + """handle finished background job""" + + metadata = data.object["metadata"] + labels: dict[str, str] = metadata.get("labels", {}) + oid: str = labels.get("btrix.org") or "" + job_type: str = labels.get("job_type") or "" + job_id: str = metadata.get("name") + + status = data.object["status"] + success = status.get("succeeded") == 1 + completion_time = status.get("completionTime") + + finalized = True + + finished = from_k8s_date(completion_time) if completion_time else dt_now() + + try: + await self.background_job_ops.job_finished( + job_id, job_type, UUID(oid), success=success, finished=finished + ) + # print( + # f"{job_type} background job completed: success: {success}, {job_id}", + # flush=True, + # ) + + # pylint: disable=broad-except + except Exception: + print("Update Background Job Error", flush=True) + traceback.print_exc() + + return {"attachments": 
[], "finalized": finalized} diff --git a/backend/btrixcloud/operator.py b/backend/btrixcloud/operator/crawls.py similarity index 61% rename from backend/btrixcloud/operator.py rename to backend/btrixcloud/operator/crawls.py index c90e6c8f98..c33afe1d59 100644 --- a/backend/btrixcloud/operator.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1,63 +1,51 @@ -""" btrixjob operator (working for metacontroller) """ +""" CrawlOperator """ -import asyncio import traceback import os from pprint import pprint -from typing import Optional, DefaultDict, TYPE_CHECKING - -from collections import defaultdict +from typing import Optional, Any +from datetime import datetime import json -from uuid import UUID -from fastapi import HTTPException -import yaml import humanize -from pydantic import BaseModel, Field - from kubernetes.utils import parse_quantity from redis import asyncio as exceptions -from .utils import ( - from_k8s_date, - to_k8s_date, - dt_now, -) -from .k8sapi import K8sAPI - -from .models import ( +from btrixcloud.models import ( NON_RUNNING_STATES, RUNNING_STATES, RUNNING_AND_STARTING_ONLY, RUNNING_AND_STARTING_STATES, SUCCESSFUL_STATES, + FAILED_STATES, + CrawlStats, CrawlFile, CrawlCompleteIn, StorageRef, + PagesAndResources, +) + +from btrixcloud.utils import ( + from_k8s_date, + to_k8s_date, + dt_now, +) + +from .baseoperator import BaseOperator, Redis +from .models import ( + CrawlSpec, + CrawlStatus, + MCBaseRequest, + MCSyncData, + POD, + CMAP, + PVC, + CJS, + BTRIX_API, ) -if TYPE_CHECKING: - from .crawlconfigs import CrawlConfigOps - from .crawls import CrawlOps - from .orgs import OrgOps - from .colls import CollectionOps - from .storages import StorageOps - from .webhooks import EventWebhookOps - from .users import UserManager - from .background_jobs import BackgroundJobOps - from redis.asyncio.client import Redis -else: - CrawlConfigOps = CrawlOps = OrgOps = CollectionOps = Redis = object - StorageOps = EventWebhookOps = UserManager = BackgroundJobOps = object - -CMAP = "ConfigMap.v1" -PVC = "PersistentVolumeClaim.v1" -POD = "Pod.v1" - -BTRIX_API = "btrix.cloud/v1" -CJS = f"CrawlJob.{BTRIX_API}" METRICS_API = "metrics.k8s.io/v1beta1" METRICS = f"PodMetrics.{METRICS_API}" @@ -73,309 +61,39 @@ EXEC_TIME_UPDATE_SECS = 60 +# pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements +# pylint: disable=invalid-name, too-many-lines, too-many-return-statements # ============================================================================ -class MCBaseRequest(BaseModel): - """base metacontroller model, used for customize hook""" - - parent: dict - controller: dict - - -# ============================================================================ -class MCSyncData(MCBaseRequest): - """sync / finalize metacontroller model""" - - children: dict - related: dict - finalizing: bool = False - - -# ============================================================================ -class MCDecoratorSyncData(BaseModel): - """sync for decoratorcontroller model""" - - object: dict - controller: dict - - attachments: dict - related: dict - finalizing: bool = False +class CrawlOperator(BaseOperator): + """CrawlOperator Handler""" - -# ============================================================================ -class CrawlSpec(BaseModel): - """spec from k8s CrawlJob object""" - - id: str - cid: UUID - oid: UUID - scale: int = 1 - storage: StorageRef - started: str - crawler_channel: str - stopping: bool = False - scheduled: bool = False - timeout: int = 0 - 
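# ----------------------------------------------------------------------------
# Illustrative sketch (editor's example, not part of the upstream change): the
# mechanism behind BaseOperator.load_from_yaml() above -- render a Jinja2
# template with the crawl params, then parse the result with
# yaml.safe_load_all(). The inline template and values are simplified
# assumptions; the real templates (crawler.yaml, redis.yaml, qa_configmap.yaml)
# ship with the chart and take many more parameters.
import yaml
from jinja2 import Environment

TEMPLATE_SRC = """
apiVersion: v1
kind: ConfigMap
metadata:
  name: example-{{ id }}
data:
  STORE_PATH: "{{ storage_path }}"
"""

params = {"id": "example-crawl-id", "storage_path": "example-org/"}
rendered = Environment().from_string(TEMPLATE_SRC).render(params)
children = list(yaml.safe_load_all(rendered))
# children[0]["metadata"]["name"] == "example-example-crawl-id"
# ----------------------------------------------------------------------------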
max_crawl_size: int = 0 - - -# ============================================================================ -class PodResourcePercentage(BaseModel): - """Resource usage percentage ratios""" - - memory: float = 0 - cpu: float = 0 - storage: float = 0 - - -# ============================================================================ -class PodResources(BaseModel): - """Pod Resources""" - - memory: int = 0 - cpu: float = 0 - storage: int = 0 - - def __init__(self, *a, **kw): - if "memory" in kw: - kw["memory"] = int(parse_quantity(kw["memory"])) - if "cpu" in kw: - kw["cpu"] = float(parse_quantity(kw["cpu"])) - if "storage" in kw: - kw["storage"] = int(parse_quantity(kw["storage"])) - super().__init__(*a, **kw) - - -# ============================================================================ -class PodInfo(BaseModel): - """Aggregate pod status info held in CrawlJob""" - - exitTime: Optional[str] = None - exitCode: Optional[int] = None - isNewExit: Optional[bool] = Field(default=None, exclude=True) - reason: Optional[str] = None - - allocated: PodResources = PodResources() - used: PodResources = PodResources() - - newCpu: Optional[int] = None - newMemory: Optional[int] = None - - def dict(self, *a, **kw): - res = super().dict(*a, **kw) - percent = { - "memory": self.get_percent_memory(), - "cpu": self.get_percent_cpu(), - "storage": self.get_percent_storage(), - } - res["percent"] = percent - return res - - def get_percent_memory(self) -> float: - """compute percent memory used""" - return ( - float(self.used.memory) / float(self.allocated.memory) - if self.allocated.memory - else 0 - ) - - def get_percent_cpu(self) -> float: - """compute percent cpu used""" - return ( - float(self.used.cpu) / float(self.allocated.cpu) - if self.allocated.cpu - else 0 - ) - - def get_percent_storage(self) -> float: - """compute percent storage used""" - return ( - float(self.used.storage) / float(self.allocated.storage) - if self.allocated.storage - else 0 - ) - - def should_restart_pod(self): - """return true if pod should be restarted""" - if self.newMemory and self.newMemory != self.allocated.memory: - return True - - if self.newCpu and self.newCpu != self.allocated.cpu: - return True - - return False - - -# ============================================================================ -class CrawlStatus(BaseModel): - """status from k8s CrawlJob object""" - - state: str = "starting" - pagesFound: int = 0 - pagesDone: int = 0 - size: int = 0 - # human readable size string - sizeHuman: str = "" - scale: int = 1 - filesAdded: int = 0 - filesAddedSize: int = 0 - finished: Optional[str] = None - stopping: bool = False - stopReason: Optional[str] = None - initRedis: bool = False - crawlerImage: Optional[str] = None - lastActiveTime: str = "" - podStatus: Optional[DefaultDict[str, PodInfo]] = defaultdict( - lambda: PodInfo() # pylint: disable=unnecessary-lambda - ) - # placeholder for pydantic 2.0 -- will require this version - # podStatus: Optional[ - # DefaultDict[str, Annotated[PodInfo, Field(default_factory=PodInfo)]] - # ] - restartTime: Optional[str] - canceled: bool = False - - # updated on pod exits and at regular interval - # Crawl Execution Time -- time all crawler pods have been running - # used to track resource usage and enforce execution minutes limit - crawlExecTime: int = 0 - - # Elapsed Exec Time -- time crawl has been running in at least one pod - # used for crawl timeouts - elapsedCrawlTime: int = 0 - - # last exec time update - lastUpdatedTime: str = "" - - # any pods exited - 
anyCrawlPodNewExit: Optional[bool] = Field(default=False, exclude=True) - - # don't include in status, use by metacontroller - resync_after: Optional[int] = Field(default=None, exclude=True) - - -# ============================================================================ -# pylint: disable=too-many-statements, too-many-public-methods, too-many-branches, too-many-nested-blocks -# pylint: disable=too-many-instance-attributes, too-many-locals, too-many-lines, too-many-arguments -class BtrixOperator(K8sAPI): - """BtrixOperator Handler""" - - crawl_config_ops: CrawlConfigOps - crawl_ops: CrawlOps - orgs_ops: OrgOps - coll_ops: CollectionOps - storage_ops: StorageOps - event_webhook_ops: EventWebhookOps - background_job_ops: BackgroundJobOps - user_ops: UserManager - - def __init__( - self, - crawl_config_ops, - crawl_ops, - org_ops, - coll_ops, - storage_ops, - event_webhook_ops, - background_job_ops, - ): - super().__init__() - - self.crawl_config_ops = crawl_config_ops - self.crawl_ops = crawl_ops - self.org_ops = org_ops - self.coll_ops = coll_ops - self.storage_ops = storage_ops - self.background_job_ops = background_job_ops - self.event_webhook_ops = event_webhook_ops - - self.user_ops = crawl_config_ops.user_manager - - self.config_file = "/config/config.yaml" + def __init__(self, *args): + super().__init__(*args) self.done_key = "crawls-done" + self.pages_key = "pages" + self.errors_key = "e" self.fast_retry_secs = int(os.environ.get("FAST_RETRY_SECS") or 0) self.log_failed_crawl_lines = int(os.environ.get("LOG_FAILED_CRAWL_LINES") or 0) - with open(self.config_file, encoding="utf-8") as fh_config: - self.shared_params = yaml.safe_load(fh_config) - - self._has_pod_metrics = False - self.compute_crawler_resources() - - # to avoid background tasks being garbage collected - # see: https://stackoverflow.com/a/74059981 - self.bg_tasks = set() - - def compute_crawler_resources(self): - """compute memory / cpu resources for crawlers""" - # pylint: disable=invalid-name - p = self.shared_params - num = max(int(p["crawler_browser_instances"]) - 1, 0) - if not p.get("crawler_cpu"): - base = parse_quantity(p["crawler_cpu_base"]) - extra = parse_quantity(p["crawler_extra_cpu_per_browser"]) - - # cpu is a floating value of cpu cores - p["crawler_cpu"] = float(base + num * extra) + def init_routes(self, app): + """init routes for this operator""" - print(f"cpu = {base} + {num} * {extra} = {p['crawler_cpu']}") - else: - print(f"cpu = {p['crawler_cpu']}") - - if not p.get("crawler_memory"): - base = parse_quantity(p["crawler_memory_base"]) - extra = parse_quantity(p["crawler_extra_memory_per_browser"]) - - # memory is always an int - p["crawler_memory"] = int(base + num * extra) - - print(f"memory = {base} + {num} * {extra} = {p['crawler_memory']}") - else: - print(f"memory = {p['crawler_memory']}") - - async def async_init(self): - """perform any async init here""" - self._has_pod_metrics = await self.is_pod_metrics_available() - print("Pod Metrics Available:", self._has_pod_metrics) - - async def sync_profile_browsers(self, data: MCSyncData): - """sync profile browsers""" - spec = data.parent.get("spec", {}) - - expire_time = from_k8s_date(spec.get("expireTime")) - browserid = spec.get("id") - - if dt_now() >= expire_time: - self.run_task(self.delete_profile_browser(browserid)) - return {"status": {}, "children": []} - - params = {} - params.update(self.shared_params) - params["id"] = browserid - params["userid"] = spec.get("userid", "") - - oid = spec.get("oid") - storage = 
StorageRef(spec.get("storageName")) - - storage_path = storage.get_storage_extra_path(oid) - storage_secret = storage.get_storage_secret_name(oid) - - params["storage_path"] = storage_path - params["storage_secret"] = storage_secret - params["profile_filename"] = spec.get("profileFilename", "") - params["crawler_image"] = spec["crawlerImage"] - - params["url"] = spec.get("startUrl", "about:blank") - params["vnc_password"] = spec.get("vncPassword") + @app.post("/op/crawls/sync") + async def mc_sync_crawls(data: MCSyncData): + return await self.sync_crawls(data) - children = self.load_from_yaml("profilebrowser.yaml", params) + # reuse sync path, but distinct endpoint for better logging + @app.post("/op/crawls/finalize") + async def mc_sync_finalize(data: MCSyncData): + return await self.sync_crawls(data) - return {"status": {}, "children": children} + @app.post("/op/crawls/customize") + async def mc_related(data: MCBaseRequest): + return self.get_related(data) - # pylint: disable=too-many-return-statements, invalid-name async def sync_crawls(self, data: MCSyncData): """sync crawls""" @@ -386,31 +104,43 @@ async def sync_crawls(self, data: MCSyncData): cid = spec["cid"] oid = spec["oid"] - redis_url = self.get_redis_url(crawl_id) + redis_url = self.k8s.get_redis_url(crawl_id) params = {} - params.update(self.shared_params) + params.update(self.k8s.shared_params) params["id"] = crawl_id params["cid"] = cid params["userid"] = spec.get("userid", "") pods = data.children[POD] + crawl = CrawlSpec( + id=crawl_id, + cid=cid, + oid=oid, + storage=StorageRef(spec["storageName"]), + crawler_channel=spec.get("crawlerChannel"), + scale=spec.get("scale", 1), + started=data.parent["metadata"]["creationTimestamp"], + stopping=spec.get("stopping", False), + timeout=spec.get("timeout") or 0, + max_crawl_size=int(spec.get("maxCrawlSize") or 0), + scheduled=spec.get("manual") != "1", + qa_source_crawl_id=spec.get("qaSourceCrawlId"), + ) + # if finalizing, crawl is being deleted if data.finalizing: if not status.finished: # if can't cancel, already finished - if not await self.cancel_crawl( - crawl_id, UUID(cid), UUID(oid), status, data.children[POD] - ): - # instead of fetching the state (that was already set) - # return exception to ignore this request, keep previous - # finished state - raise HTTPException(status_code=400, detail="out_of_sync_status") + await self.cancel_crawl(crawl, status, data.children[POD]) + # instead of fetching the state (that was already set) + # return exception to ignore this request, keep previous + # finished state + # raise HTTPException(status_code=400, detail="out_of_sync_status") return await self.finalize_response( - crawl_id, - UUID(oid), + crawl, status, spec, data.children, @@ -423,10 +153,9 @@ async def sync_crawls(self, data: MCSyncData): print( f"warn crawl {crawl_id} finished but not deleted, post-finish taking too long?" 
) - self.run_task(self.delete_crawl_job(crawl_id)) + self.run_task(self.k8s.delete_crawl_job(crawl.id)) return await self.finalize_response( - crawl_id, - UUID(oid), + crawl, status, spec, data.children, @@ -438,28 +167,14 @@ async def sync_crawls(self, data: MCSyncData): # pylint: disable=bare-except, broad-except except: # fail crawl if config somehow missing, shouldn't generally happen - await self.fail_crawl(crawl_id, UUID(cid), UUID(oid), status, pods) + await self.fail_crawl(crawl, status, pods) return self._empty_response(status) - crawl = CrawlSpec( - id=crawl_id, - cid=cid, - oid=oid, - storage=StorageRef(spec["storageName"]), - crawler_channel=spec.get("crawlerChannel"), - scale=spec.get("scale", 1), - started=data.parent["metadata"]["creationTimestamp"], - stopping=spec.get("stopping", False), - timeout=spec.get("timeout") or 0, - max_crawl_size=int(spec.get("maxCrawlSize") or 0), - scheduled=spec.get("manual") != "1", - ) - # shouldn't get here, crawl should already be finalizing when canceled # just in case, handle canceled-but-not-finalizing here if status.state == "canceled": - await self.delete_crawl_job(crawl.id) + await self.k8s.delete_crawl_job(crawl.id) return {"status": status.dict(exclude_none=True), "children": []} # first, check storage quota, and fail immediately if quota reached @@ -471,9 +186,7 @@ async def sync_crawls(self, data: MCSyncData): and not data.children[PVC] and await self.org_ops.storage_quota_reached(crawl.oid) ): - await self.mark_finished( - crawl.id, crawl.cid, crawl.oid, status, "skipped_quota_reached" - ) + await self.mark_finished(crawl, status, "skipped_quota_reached") return self._empty_response(status) if status.state in ("starting", "waiting_org_limit"): @@ -481,7 +194,7 @@ async def sync_crawls(self, data: MCSyncData): return self._empty_response(status) await self.set_state( - "starting", status, crawl.id, allowed_from=["waiting_org_limit"] + "starting", status, crawl, allowed_from=["waiting_org_limit"] ) if len(pods): @@ -501,8 +214,7 @@ async def sync_crawls(self, data: MCSyncData): if status.finished: return await self.finalize_response( - crawl_id, - UUID(oid), + crawl, status, spec, data.children, @@ -510,21 +222,29 @@ async def sync_crawls(self, data: MCSyncData): ) await self.increment_pod_exec_time( - pods, status, crawl.id, crawl.oid, EXEC_TIME_UPDATE_SECS + pods, crawl, status, EXEC_TIME_UPDATE_SECS ) else: status.scale = crawl.scale - status.lastUpdatedTime = to_k8s_date(dt_now()) + now = dt_now() + await self.crawl_ops.inc_crawl_exec_time( + crawl.db_crawl_id, crawl.is_qa, 0, now + ) + status.lastUpdatedTime = to_k8s_date(now) children = self._load_redis(params, status, data.children) storage_path = crawl.storage.get_storage_extra_path(oid) storage_secret = crawl.storage.get_storage_secret_name(oid) + if not crawl.is_qa: + params["profile_filename"] = configmap["PROFILE_FILENAME"] + else: + storage_path += "qa/" + params["storage_path"] = storage_path params["storage_secret"] = storage_secret - params["profile_filename"] = configmap["PROFILE_FILENAME"] # only resolve if not already set # not automatically updating image for existing crawls @@ -538,6 +258,8 @@ async def sync_crawls(self, data: MCSyncData): params["storage_filename"] = configmap["STORE_FILENAME"] params["restart_time"] = spec.get("restartTime") + params["warc_prefix"] = spec.get("warcPrefix") + params["redis_url"] = redis_url if spec.get("restartTime") != status.restartTime: @@ -548,6 +270,10 @@ async def sync_crawls(self, data: MCSyncData): else: 
params["force_restart"] = False + if crawl.qa_source_crawl_id: + params["qa_source_crawl_id"] = crawl.qa_source_crawl_id + children.extend(await self._load_qa_configmap(params, data.children)) + for i in range(0, status.scale): children.extend(self._load_crawler(params, i, status, data.children)) @@ -573,6 +299,25 @@ def _load_redis(self, params, status, children): return self.load_from_yaml("redis.yaml", params) + async def _load_qa_configmap(self, params, children): + qa_source_crawl_id = params["qa_source_crawl_id"] + name = f"qa-replay-{qa_source_crawl_id}" + + if name in children[CMAP]: + return [children[CMAP][name]] + + pages, _ = await self.page_ops.list_pages(qa_source_crawl_id, page_size=1000) + + crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id) + + res_and_pages = PagesAndResources(resources=crawl_replay.resources, pages=pages) + + params["name"] = name + params["qa_source_replay_json"] = res_and_pages.json() + # params["qa_source_replay_json"] = crawl_replay.json(include={"resources"}) + + return self.load_from_yaml("qa_configmap.yaml", params) + def _load_crawler(self, params, i, status, children): name = f"crawl-{params['id']}-{i}" has_pod = name in children[POD] @@ -587,7 +332,10 @@ def _load_crawler(self, params, i, status, children): if params.get("do_restart"): print(f"Restart {name}") - params["priorityClassName"] = f"crawl-instance-{i}" + if params.get("qa_source_crawl_id"): + params["priorityClassName"] = f"qa-crawl-instance-{i}" + else: + params["priorityClassName"] = f"crawl-instance-{i}" return self.load_from_yaml("crawler.yaml", params) @@ -661,7 +409,15 @@ def sync_resources(self, status, name, pod, children): src = pvc["spec"]["resources"]["requests"] resources.storage = int(parse_quantity(src.get("storage"))) - async def set_state(self, state, status, crawl_id, allowed_from, **kwargs): + async def set_state( + self, + state: str, + status: CrawlStatus, + crawl: CrawlSpec, + allowed_from: list[str], + finished: Optional[datetime] = None, + stats: Optional[CrawlStats] = None, + ): """set status state and update db, if changed if allowed_from passed in, can only transition from allowed_from state, otherwise get current state from db and return @@ -688,15 +444,22 @@ async def set_state(self, state, status, crawl_id, allowed_from, **kwargs): """ if not allowed_from or status.state in allowed_from: res = await self.crawl_ops.update_crawl_state_if_allowed( - crawl_id, state=state, allowed_from=allowed_from, **kwargs + crawl.db_crawl_id, + crawl.is_qa, + state=state, + allowed_from=allowed_from, + finished=finished, + stats=stats, ) if res: - print(f"Setting state: {status.state} -> {state}, {crawl_id}") + print(f"Setting state: {status.state} -> {state}, {crawl.id}") status.state = state return True # get actual crawl state - actual_state, finished = await self.crawl_ops.get_crawl_state(crawl_id) + actual_state, finished = await self.crawl_ops.get_crawl_state( + crawl.db_crawl_id, crawl.is_qa + ) if actual_state: status.state = actual_state if finished: @@ -709,16 +472,10 @@ async def set_state(self, state, status, crawl_id, allowed_from, **kwargs): if status.state != state: print( - f"Not setting state: {status.state} -> {state}, {crawl_id} not allowed" + f"Not setting state: {status.state} -> {state}, {crawl.id} not allowed" ) return False - def load_from_yaml(self, filename, params): - """load and parse k8s template from yaml file""" - return list( - yaml.safe_load_all(self.templates.env.get_template(filename).render(params)) - ) - def 
get_related(self, data: MCBaseRequest): """return objects related to crawl pods""" spec = data.parent.get("spec", {}) @@ -738,7 +495,7 @@ def get_related(self, data: MCBaseRequest): }, ] - if self._has_pod_metrics: + if self.k8s.has_pod_metrics: related_resources.append( { "apiVersion": METRICS_API, @@ -782,23 +539,21 @@ async def can_start_new(self, crawl: CrawlSpec, data: MCSyncData, status): i += 1 await self.set_state( - "waiting_org_limit", status, crawl.id, allowed_from=["starting"] + "waiting_org_limit", status, crawl, allowed_from=["starting"] ) return False async def cancel_crawl( self, - crawl_id: str, - cid: UUID, - oid: UUID, + crawl: CrawlSpec, status: CrawlStatus, pods: dict, ) -> bool: """Mark crawl as canceled""" - if not await self.mark_finished(crawl_id, cid, oid, status, "canceled"): + if not await self.mark_finished(crawl, status, "canceled"): return False - await self.mark_for_cancelation(crawl_id) + await self.mark_for_cancelation(crawl.id) if not status.canceled: for name, pod in pods.items(): @@ -823,19 +578,15 @@ async def cancel_crawl( async def fail_crawl( self, - crawl_id: str, - cid: UUID, - oid: UUID, + crawl: CrawlSpec, status: CrawlStatus, pods: dict, - stats=None, + stats: Optional[CrawlStats] = None, ) -> bool: """Mark crawl as failed, log crawl state and print crawl logs, if possible""" prev_state = status.state - if not await self.mark_finished( - crawl_id, cid, oid, status, "failed", stats=stats - ): + if not await self.mark_finished(crawl, status, "failed", stats=stats): return False if not self.log_failed_crawl_lines or prev_state == "failed": @@ -847,7 +598,7 @@ async def fail_crawl( print(f"============== POD STATUS: {name} ==============") pprint(pods[name]["status"]) - self.run_task(self.print_pod_logs(pod_names, self.log_failed_crawl_lines)) + self.run_task(self.k8s.print_pod_logs(pod_names, self.log_failed_crawl_lines)) return True @@ -860,8 +611,7 @@ def _empty_response(self, status): async def finalize_response( self, - crawl_id: str, - oid: UUID, + crawl: CrawlSpec, status: CrawlStatus, spec: dict, children: dict, @@ -869,7 +619,7 @@ async def finalize_response( ): """ensure crawl id ready for deletion""" - redis_pod = f"redis-{crawl_id}" + redis_pod = f"redis-{crawl.id}" new_children = [] finalized = False @@ -880,7 +630,7 @@ async def finalize_response( # if has other pods, keep redis pod until they are removed if len(pods) > 1: new_children = self._load_redis(params, status, children) - await self.increment_pod_exec_time(pods, status, crawl_id, oid) + await self.increment_pod_exec_time(pods, crawl, status) # keep pvs until pods are removed if new_children: @@ -891,12 +641,15 @@ async def finalize_response( if status.finished: ttl = spec.get("ttlSecondsAfterFinished", DEFAULT_TTL) finished = from_k8s_date(status.finished) - if (dt_now() - finished).total_seconds() > ttl > 0: - print("CrawlJob expired, deleting: " + crawl_id) + if (dt_now() - finished).total_seconds() > ttl >= 0: + print("CrawlJob expired, deleting: " + crawl.id) finalized = True else: finalized = True + if finalized and crawl.is_qa: + await self.crawl_ops.qa_run_finished(crawl.db_crawl_id) + return { "status": status.dict(exclude_none=True), "children": new_children, @@ -907,7 +660,7 @@ async def _get_redis(self, redis_url: str) -> Optional[Redis]: """init redis, ensure connectivity""" redis = None try: - redis = await self.get_redis_client(redis_url) + redis = await self.k8s.get_redis_client(redis_url) # test connection await redis.ping() return redis @@ -942,34 +695,36 @@ 
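# ----------------------------------------------------------------------------
# Illustrative sketch (editor's example, not part of the upstream change): the
# TTL check in finalize_response() above. Changing "> ttl > 0" to "> ttl >= 0"
# means a ttlSecondsAfterFinished of 0 now deletes the CrawlJob as soon as it
# finishes, while a negative ttl still keeps it indefinitely. Times below are
# made-up example values.
from datetime import datetime, timedelta

def crawljob_expired(finished: datetime, ttl: int, now: datetime) -> bool:
    """mirror of the chained comparison used in finalize_response"""
    return (now - finished).total_seconds() > ttl >= 0

_now = datetime(2024, 1, 1, 12, 0, 0)
_finished = _now - timedelta(seconds=30)

assert crawljob_expired(_finished, ttl=0, now=_now)        # expire immediately
assert not crawljob_expired(_finished, ttl=60, now=_now)   # still within TTL
assert not crawljob_expired(_finished, ttl=-1, now=_now)   # never expires
# ----------------------------------------------------------------------------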
async def sync_crawl_state( if status.anyCrawlPodNewExit: await self.log_crashes(crawl.id, status.podStatus, redis) - if not crawler_running: + if not crawler_running or not redis: + # if either crawler is not running or redis is inaccessible if self.should_mark_waiting(status.state, crawl.started): + # mark as waiting (if already running) await self.set_state( "waiting_capacity", status, - crawl.id, + crawl, allowed_from=RUNNING_AND_STARTING_ONLY, ) - # for now, don't reset redis once inited - if status.lastActiveTime and ( - (dt_now() - from_k8s_date(status.lastActiveTime)).total_seconds() - > REDIS_TTL - ): - print( - f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs" - ) - status.initRedis = False - - # if still running, resync after N seconds - status.resync_after = self.fast_retry_secs - return status - - status.initRedis = True - status.lastActiveTime = to_k8s_date(dt_now()) + if not crawler_running and redis: + # if crawler running, but no redis, stop redis instance until crawler + # is running + if status.lastActiveTime and ( + ( + dt_now() - from_k8s_date(status.lastActiveTime) + ).total_seconds() + > REDIS_TTL + ): + print( + f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs" + ) + status.initRedis = False + elif crawler_running and not redis: + # if crawler is running, but no redis, init redis + status.initRedis = True + status.lastActiveTime = to_k8s_date(dt_now()) - if not redis: - # if still running, resync after N seconds + # if no crawler / no redis, resync after N seconds status.resync_after = self.fast_retry_secs return status @@ -979,7 +734,7 @@ async def sync_crawl_state( if await self.set_state( "running", status, - crawl.id, + crawl, allowed_from=["starting", "waiting_capacity"], ): self.run_task( @@ -989,7 +744,6 @@ async def sync_crawl_state( ) file_done = await redis.lpop(self.done_key) - while file_done: msg = json.loads(file_done) # add completed file @@ -1000,6 +754,25 @@ async def sync_crawl_state( # get next file done file_done = await redis.lpop(self.done_key) + page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}") + qa_run_id = crawl.id if crawl.is_qa else None + + while page_crawled: + print("PAGE DATA", flush=True) + print(page_crawled, flush=True) + page_dict = json.loads(page_crawled) + await self.page_ops.add_page_to_db( + page_dict, crawl.db_crawl_id, qa_run_id, crawl.oid + ) + page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}") + + crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}") + while crawl_error: + await self.crawl_ops.add_crawl_error( + crawl.db_crawl_id, crawl.is_qa, crawl_error + ) + crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}") + # ensure filesAdded and filesAddedSize always set status.filesAdded = int(await redis.get("filesAdded") or 0) status.filesAddedSize = int(await redis.get("filesAddedSize") or 0) @@ -1095,20 +868,24 @@ def handle_terminated_pod(self, name, role, status, terminated): async def increment_pod_exec_time( self, pods: dict[str, dict], + crawl: CrawlSpec, status: CrawlStatus, - crawl_id: str, - oid: UUID, min_duration=0, ) -> None: """inc exec time tracking""" now = dt_now() - if not status.lastUpdatedTime: + update_start_time = await self.crawl_ops.get_crawl_exec_last_update_time( + crawl.db_crawl_id + ) + + if not update_start_time: + await self.crawl_ops.inc_crawl_exec_time( + crawl.db_crawl_id, crawl.is_qa, 0, now + ) status.lastUpdatedTime = to_k8s_date(now) return - update_start_time = from_k8s_date(status.lastUpdatedTime) - reason = None 
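# ----------------------------------------------------------------------------
# Illustrative sketch (editor's example, not part of the upstream change): the
# list-draining pattern used above for the new per-crawl Redis keys
# ("<crawl_id>:pages" and "<crawl_id>:e"). Entries are JSON (per the
# json.loads above) and are popped one at a time on each sync; a reachable
# Redis instance is assumed.
import json
from redis import asyncio as aioredis

async def drain_pages(redis: aioredis.Redis, crawl_id: str) -> list[dict]:
    """pop queued page entries until the pages list is empty"""
    pages = []
    entry = await redis.lpop(f"{crawl_id}:pages")
    while entry:
        pages.append(json.loads(entry))
        entry = await redis.lpop(f"{crawl_id}:pages")
    return pages
# ----------------------------------------------------------------------------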
update_duration = (now - update_start_time).total_seconds() @@ -1182,8 +959,9 @@ async def increment_pod_exec_time( max_duration = max(duration, max_duration) if exec_time: - await self.crawl_ops.inc_crawl_exec_time(crawl_id, exec_time) - await self.org_ops.inc_org_time_stats(oid, exec_time, True) + await self.org_ops.inc_org_time_stats( + crawl.oid, exec_time, True, crawl.is_qa + ) status.crawlExecTime += exec_time status.elapsedCrawlTime += max_duration @@ -1192,6 +970,9 @@ async def increment_pod_exec_time( flush=True, ) + await self.crawl_ops.inc_crawl_exec_time( + crawl.db_crawl_id, crawl.is_qa, exec_time, now + ) status.lastUpdatedTime = to_k8s_date(now) def should_mark_waiting(self, state, started): @@ -1216,7 +997,7 @@ async def add_used_stats(self, crawl_id, pod_status, redis, metrics): pod_info.used.storage = storage # if no pod metrics, get memory estimate from redis itself - if not self._has_pod_metrics: + if not self.k8s.has_pod_metrics: stats = await redis.info("memory") pod_info.used.memory = int(stats.get("used_memory_rss", 0)) @@ -1292,7 +1073,13 @@ async def add_file_to_crawl(self, cc_data, crawl, redis): await redis.incr("filesAddedSize", filecomplete.size) - await self.crawl_ops.add_crawl_file(crawl.id, crawl_file, filecomplete.size) + await self.crawl_ops.add_crawl_file( + crawl.db_crawl_id, crawl.is_qa, crawl_file, filecomplete.size + ) + + # no replicas for QA for now + if crawl.is_qa: + return True try: await self.background_job_ops.create_replica_jobs( @@ -1336,7 +1123,9 @@ async def is_crawl_stopping( return None - async def get_redis_crawl_stats(self, redis: Redis, crawl_id: str): + async def get_redis_crawl_stats( + self, redis: Redis, crawl_id: str + ) -> tuple[CrawlStats, dict[str, Any]]: """get page stats""" try: # crawler >0.9.0, done key is a value @@ -1349,7 +1138,7 @@ async def get_redis_crawl_stats(self, redis: Redis, crawl_id: str): sizes = await redis.hgetall(f"{crawl_id}:size") archive_size = sum(int(x) for x in sizes.values()) - stats = {"found": pages_found, "done": pages_done, "size": archive_size} + stats = CrawlStats(found=pages_found, done=pages_done, size=archive_size) return stats, sizes async def update_crawl_state( @@ -1365,15 +1154,17 @@ async def update_crawl_state( stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id) # need to add size of previously completed WACZ files as well! 
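# ----------------------------------------------------------------------------
# Illustrative sketch (editor's example, not part of the upstream change): how
# the reported crawl size is assembled in get_redis_crawl_stats() /
# update_crawl_state() above. Per-instance sizes from the "<crawl_id>:size"
# Redis hash are summed into CrawlStats.size, then the bytes of WACZ files
# already uploaded (status.filesAddedSize) are added on top. Numbers are made
# up for the example.
from btrixcloud.models import CrawlStats

sizes = {"crawl-example-0": "1048576", "crawl-example-1": "524288"}  # hgetall result
files_added_size = 2097152  # completed WACZ bytes already uploaded

stats = CrawlStats(
    found=120,
    done=80,
    size=sum(int(value) for value in sizes.values()),
)
stats.size += files_added_size  # 1048576 + 524288 + 2097152 == 3670016
# ----------------------------------------------------------------------------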
- stats["size"] += status.filesAddedSize + stats.size += status.filesAddedSize # update status - status.pagesDone = stats["done"] - status.pagesFound = stats["found"] - status.size = stats["size"] + status.pagesDone = stats.done + status.pagesFound = stats.found + status.size = stats.size status.sizeHuman = humanize.naturalsize(status.size) - await self.crawl_ops.update_running_crawl_stats(crawl.id, stats) + await self.crawl_ops.update_running_crawl_stats( + crawl.db_crawl_id, crawl.is_qa, stats + ) for key, value in sizes.items(): value = int(value) @@ -1409,9 +1200,7 @@ async def update_crawl_state( # check if one-page crawls actually succeeded # if only one page found, and no files, assume failed if status.pagesFound == 1 and not status.filesAdded: - await self.fail_crawl( - crawl.id, crawl.cid, crawl.oid, status, pods, stats - ) + await self.fail_crawl(crawl, status, pods, stats) return status if status.stopReason in ("stopped_by_user", "stopped_quota_reached"): @@ -1419,21 +1208,15 @@ async def update_crawl_state( else: state = "complete" - await self.mark_finished( - crawl.id, crawl.cid, crawl.oid, status, state, crawl, stats - ) + await self.mark_finished(crawl, status, state, stats) # check if all crawlers failed elif status_count.get("failed", 0) >= crawl.scale: # if stopping, and no pages finished, mark as canceled if status.stopping and not status.pagesDone: - await self.mark_finished( - crawl.id, crawl.cid, crawl.oid, status, "canceled", crawl, stats - ) + await self.mark_finished(crawl, status, "canceled", stats) else: - await self.fail_crawl( - crawl.id, crawl.cid, crawl.oid, status, pods, stats - ) + await self.fail_crawl(crawl, status, pods, stats) # check for other statuses else: @@ -1450,7 +1233,7 @@ async def update_crawl_state( new_status = "pending-wait" if new_status: await self.set_state( - new_status, status, crawl.id, allowed_from=RUNNING_STATES + new_status, status, crawl, allowed_from=RUNNING_STATES ) return status @@ -1458,22 +1241,15 @@ async def update_crawl_state( # pylint: disable=too-many-arguments async def mark_finished( self, - crawl_id: str, - cid: UUID, - oid: UUID, + crawl: CrawlSpec, status: CrawlStatus, state: str, - crawl=None, - stats=None, + stats: Optional[CrawlStats] = None, ) -> bool: """mark crawl as finished, set finished timestamp and final state""" finished = dt_now() - kwargs = {"finished": finished} - if stats: - kwargs["stats"] = stats - if state in SUCCESSFUL_STATES: allowed_from = RUNNING_STATES else: @@ -1481,7 +1257,12 @@ async def mark_finished( # if set_state returns false, already set to same status, return if not await self.set_state( - state, status, crawl_id, allowed_from=allowed_from, **kwargs + state, + status, + crawl, + allowed_from=allowed_from, + finished=finished, + stats=stats, ): print("already finished, ignoring mark_finished") if not status.finished: @@ -1491,44 +1272,63 @@ async def mark_finished( status.finished = to_k8s_date(finished) - if crawl and state in SUCCESSFUL_STATES: + if state in SUCCESSFUL_STATES: await self.inc_crawl_complete_stats(crawl, finished) - self.run_task( - self.do_crawl_finished_tasks( - crawl_id, cid, oid, status.filesAddedSize, state - ) - ) + # Regular Crawl Finished + if not crawl.is_qa: + self.run_task(self.do_crawl_finished_tasks(crawl, status, state)) + + # QA Run Finished + else: + self.run_task(self.do_qa_run_finished_tasks(crawl, state)) return True # pylint: disable=too-many-arguments async def do_crawl_finished_tasks( self, - crawl_id: str, - cid: UUID, - oid: UUID, - 
files_added_size: int, + crawl: CrawlSpec, + status: CrawlStatus, state: str, ) -> None: """Run tasks after crawl completes in asyncio.task coroutine.""" - await self.crawl_config_ops.stats_recompute_last(cid, files_added_size, 1) + await self.crawl_config_ops.stats_recompute_last( + crawl.cid, status.filesAddedSize, 1 + ) + + if state in SUCCESSFUL_STATES and crawl.oid: + await self.org_ops.inc_org_bytes_stored( + crawl.oid, status.filesAddedSize, "crawl" + ) + await self.coll_ops.add_successful_crawl_to_collections(crawl.id, crawl.cid) - if state in SUCCESSFUL_STATES and oid: - await self.org_ops.inc_org_bytes_stored(oid, files_added_size, "crawl") - await self.coll_ops.add_successful_crawl_to_collections(crawl_id, cid) + if state in FAILED_STATES: + await self.crawl_ops.delete_crawl_files(crawl.id, crawl.oid) + await self.page_ops.delete_crawl_pages(crawl.id, crawl.oid) await self.event_webhook_ops.create_crawl_finished_notification( - crawl_id, oid, state + crawl.id, crawl.oid, state ) - # add crawl errors to db - await self.add_crawl_errors_to_db(crawl_id) + # finally, delete job + await self.k8s.delete_crawl_job(crawl.id) + + # pylint: disable=too-many-arguments + async def do_qa_run_finished_tasks( + self, + crawl: CrawlSpec, + state: str, + ) -> None: + """Run tasks after qa run completes in asyncio.task coroutine.""" + + if state in FAILED_STATES: + await self.page_ops.delete_qa_run_from_pages(crawl.db_crawl_id, crawl.id) # finally, delete job - await self.delete_crawl_job(crawl_id) + await self.k8s.delete_crawl_job(crawl.id) - async def inc_crawl_complete_stats(self, crawl, finished): + async def inc_crawl_complete_stats(self, crawl: CrawlSpec, finished: datetime): """Increment Crawl Stats""" started = from_k8s_date(crawl.started) @@ -1537,12 +1337,12 @@ async def inc_crawl_complete_stats(self, crawl, finished): print(f"Duration: {duration}", flush=True) - await self.org_ops.inc_org_time_stats(crawl.oid, duration) + await self.org_ops.inc_org_time_stats(crawl.oid, duration, False, crawl.is_qa) async def mark_for_cancelation(self, crawl_id): """mark crawl as canceled in redis""" try: - redis_url = self.get_redis_url(crawl_id) + redis_url = self.k8s.get_redis_url(crawl_id) redis = await self._get_redis(redis_url) if not redis: return False @@ -1552,220 +1352,3 @@ async def mark_for_cancelation(self, crawl_id): finally: if redis: await redis.close() - - async def add_crawl_errors_to_db(self, crawl_id, inc=100): - """Pull crawl errors from redis and write to mongo db""" - index = 0 - redis = None - try: - redis_url = self.get_redis_url(crawl_id) - redis = await self._get_redis(redis_url) - if not redis: - return - - # ensure this only runs once - if not await redis.setnx("errors-exported", "1"): - return - - while True: - skip = index * inc - upper_bound = skip + inc - 1 - errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound) - if not errors: - break - - await self.crawl_ops.add_crawl_errors(crawl_id, errors) - - if len(errors) < inc: - # If we have fewer than inc errors, we can assume this is the - # last page of data to add. 
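# ----------------------------------------------------------------------------
# Illustrative sketch (editor's example, not part of the upstream change): the
# QA routing that mark_finished() / do_qa_run_finished_tasks() above rely on.
# CrawlSpec (defined in operator/models.py later in this diff) resolves
# db_crawl_id to the crawl being QA'd whenever qa_source_crawl_id is set, so
# state, stats, and files are written against the source crawl while the QA
# run keeps its own id. The ids and storage name below are placeholders.
from uuid import uuid4
from btrixcloud.models import StorageRef
from btrixcloud.operator.models import CrawlSpec

qa_run = CrawlSpec(
    id="qa-example-run-id",
    cid=uuid4(),
    oid=uuid4(),
    storage=StorageRef("default"),
    started="2024-01-01T12:00:00Z",
    crawler_channel="default",
    qa_source_crawl_id="example-source-crawl-id",
)

assert qa_run.is_qa
assert qa_run.db_crawl_id == "example-source-crawl-id"
# ----------------------------------------------------------------------------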
- break - index += 1 - # pylint: disable=bare-except - except: - # likely redis has already been deleted, so nothing to do - pass - finally: - if redis: - await redis.close() - - def get_cronjob_crawl_related(self, data: MCBaseRequest): - """return configmap related to crawl""" - labels = data.parent.get("metadata", {}).get("labels", {}) - cid = labels.get("btrix.crawlconfig") - return { - "relatedResources": [ - { - "apiVersion": "v1", - "resource": "configmaps", - "labelSelector": {"matchLabels": {"btrix.crawlconfig": cid}}, - } - ] - } - - async def sync_cronjob_crawl(self, data: MCDecoratorSyncData): - """create crawljobs from a job object spawned by cronjob""" - - metadata = data.object["metadata"] - labels = metadata.get("labels", {}) - cid = labels.get("btrix.crawlconfig") - - name = metadata.get("name") - crawl_id = name - - actual_state, finished = await self.crawl_ops.get_crawl_state(crawl_id) - if finished: - status = None - # mark job as completed - if not data.object["status"].get("succeeded"): - print("Cron Job Complete!", finished) - status = { - "succeeded": 1, - "startTime": metadata.get("creationTimestamp"), - "completionTime": to_k8s_date(finished), - } - - return { - "attachments": [], - "annotations": {"finished": finished}, - "status": status, - } - - configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"] - - oid = configmap.get("ORG_ID") - userid = configmap.get("USER_ID") - - crawljobs = data.attachments[CJS] - - org = await self.org_ops.get_org_by_id(UUID(oid)) - - crawl_id, crawljob = self.new_crawl_job_yaml( - cid, - userid=userid, - oid=oid, - storage=org.storage, - crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"), - scale=int(configmap.get("INITIAL_SCALE", 1)), - crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)), - max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")), - manual=False, - crawl_id=crawl_id, - ) - - attachments = list(yaml.safe_load_all(crawljob)) - - if crawl_id in crawljobs: - attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"] - - if not actual_state: - # pylint: disable=duplicate-code - crawlconfig = await self.crawl_config_ops.get_crawl_config( - UUID(cid), UUID(oid) - ) - if not crawlconfig: - print( - f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?" 
- ) - return {"attachments": []} - - # db create - user = await self.user_ops.get_by_id(UUID(userid)) - if not user: - print(f"error: missing user for id {userid}") - return {"attachments": []} - - await self.crawl_config_ops.add_new_crawl( - crawl_id, crawlconfig, user, manual=False - ) - print("Scheduled Crawl Created: " + crawl_id) - - return { - "attachments": attachments, - } - - async def finalize_background_job(self, data: MCDecoratorSyncData) -> dict: - """handle finished background job""" - - metadata = data.object["metadata"] - labels: dict[str, str] = metadata.get("labels", {}) - oid: str = labels.get("btrix.org") or "" - job_type: str = labels.get("job_type") or "" - job_id: str = metadata.get("name") - - status = data.object["status"] - success = status.get("succeeded") == 1 - completion_time = status.get("completionTime") - - finalized = True - - finished = from_k8s_date(completion_time) if completion_time else dt_now() - - try: - await self.background_job_ops.job_finished( - job_id, job_type, UUID(oid), success=success, finished=finished - ) - # print( - # f"{job_type} background job completed: success: {success}, {job_id}", - # flush=True, - # ) - - # pylint: disable=broad-except - except Exception: - print("Update Background Job Error", flush=True) - traceback.print_exc() - - return {"attachments": [], "finalized": finalized} - - def run_task(self, func): - """add bg tasks to set to avoid premature garbage collection""" - task = asyncio.create_task(func) - self.bg_tasks.add(task) - task.add_done_callback(self.bg_tasks.discard) - - -# ============================================================================ -def init_operator_api(app, *args): - """regsiters webhook handlers for metacontroller""" - - oper = BtrixOperator(*args) - - @app.post("/op/crawls/sync") - async def mc_sync_crawls(data: MCSyncData): - return await oper.sync_crawls(data) - - # reuse sync path, but distinct endpoint for better logging - @app.post("/op/crawls/finalize") - async def mc_sync_finalize(data: MCSyncData): - return await oper.sync_crawls(data) - - @app.post("/op/crawls/customize") - async def mc_related(data: MCBaseRequest): - return oper.get_related(data) - - @app.post("/op/profilebrowsers/sync") - async def mc_sync_profile_browsers(data: MCSyncData): - return await oper.sync_profile_browsers(data) - - @app.post("/op/cronjob/sync") - async def mc_sync_cronjob_crawls(data: MCDecoratorSyncData): - return await oper.sync_cronjob_crawl(data) - - @app.post("/op/cronjob/customize") - async def mc_cronjob_related(data: MCBaseRequest): - return oper.get_cronjob_crawl_related(data) - - # nop, but needed for metacontroller - @app.post("/op/backgroundjob/sync") - async def mc_sync_background_jobs(): - return {"attachments": []} - - @app.post("/op/backgroundjob/finalize") - async def mc_finalize_background_jobs(data: MCDecoratorSyncData): - return await oper.finalize_background_job(data) - - @app.get("/healthz", include_in_schema=False) - async def healthz(): - return {} - - return oper diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py new file mode 100644 index 0000000000..445e86fbca --- /dev/null +++ b/backend/btrixcloud/operator/cronjobs.py @@ -0,0 +1,130 @@ +""" Operator handler for crawl CronJobs """ + +from uuid import UUID +import yaml + +from btrixcloud.utils import to_k8s_date +from .models import MCBaseRequest, MCDecoratorSyncData, CJS, CMAP +from .baseoperator import BaseOperator + + +# pylint: disable=too-many-locals +# 
============================================================================ +class CronJobOperator(BaseOperator): + """CronJob Operator""" + + def init_routes(self, app): + """init routes for crawl CronJob decorator""" + + @app.post("/op/cronjob/sync") + async def mc_sync_cronjob_crawls(data: MCDecoratorSyncData): + return await self.sync_cronjob_crawl(data) + + @app.post("/op/cronjob/customize") + async def mc_cronjob_related(data: MCBaseRequest): + return self.get_cronjob_crawl_related(data) + + def get_cronjob_crawl_related(self, data: MCBaseRequest): + """return configmap related to crawl""" + labels = data.parent.get("metadata", {}).get("labels", {}) + cid = labels.get("btrix.crawlconfig") + return { + "relatedResources": [ + { + "apiVersion": "v1", + "resource": "configmaps", + "labelSelector": {"matchLabels": {"btrix.crawlconfig": cid}}, + } + ] + } + + async def sync_cronjob_crawl(self, data: MCDecoratorSyncData): + """create crawljobs from a job object spawned by cronjob""" + + metadata = data.object["metadata"] + labels = metadata.get("labels", {}) + cid = labels.get("btrix.crawlconfig") + + name = metadata.get("name") + crawl_id = name + + actual_state, finished = await self.crawl_ops.get_crawl_state( + crawl_id, is_qa=False + ) + if finished: + status = None + # mark job as completed + if not data.object["status"].get("succeeded"): + print("Cron Job Complete!", finished) + status = { + "succeeded": 1, + "startTime": metadata.get("creationTimestamp"), + "completionTime": to_k8s_date(finished), + } + + return { + "attachments": [], + "annotations": {"finished": finished}, + "status": status, + } + + configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"] + + oid = configmap.get("ORG_ID") + userid = configmap.get("USER_ID") + + crawljobs = data.attachments[CJS] + + org = await self.org_ops.get_org_by_id(UUID(oid)) + + warc_prefix = None + + if not actual_state: + # cronjob doesn't exist yet + crawlconfig = await self.crawl_config_ops.get_crawl_config( + UUID(cid), UUID(oid) + ) + if not crawlconfig: + print( + f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?" 
+ ) + return {"attachments": []} + + # db create + user = await self.user_ops.get_by_id(UUID(userid)) + if not user: + print(f"error: missing user for id {userid}") + return {"attachments": []} + + warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig) + + await self.crawl_config_ops.add_new_crawl( + crawl_id, + crawlconfig, + user, + manual=False, + ) + print("Scheduled Crawl Created: " + crawl_id) + + crawl_id, crawljob = self.k8s.new_crawl_job_yaml( + cid, + userid=userid, + oid=oid, + storage=org.storage, + crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"), + scale=int(configmap.get("INITIAL_SCALE", 1)), + crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)), + max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")), + manual=False, + crawl_id=crawl_id, + warc_prefix=warc_prefix, + ) + + attachments = list(yaml.safe_load_all(crawljob)) + + if crawl_id in crawljobs: + attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"] + + return { + "attachments": attachments, + } diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py new file mode 100644 index 0000000000..f5a2f41473 --- /dev/null +++ b/backend/btrixcloud/operator/models.py @@ -0,0 +1,209 @@ +""" Operator Models """ + +from collections import defaultdict +from uuid import UUID +from typing import Optional, DefaultDict +from pydantic import BaseModel, Field +from kubernetes.utils import parse_quantity +from btrixcloud.models import StorageRef + + +BTRIX_API = "btrix.cloud/v1" + +CMAP = "ConfigMap.v1" +PVC = "PersistentVolumeClaim.v1" +POD = "Pod.v1" +CJS = f"CrawlJob.{BTRIX_API}" + + +# ============================================================================ +class MCBaseRequest(BaseModel): + """base metacontroller model, used for customize hook""" + + parent: dict + controller: dict + + +# ============================================================================ +class MCSyncData(MCBaseRequest): + """sync / finalize metacontroller model""" + + children: dict + related: dict + finalizing: bool = False + + +# ============================================================================ +class MCDecoratorSyncData(BaseModel): + """sync for decoratorcontroller model""" + + object: dict + controller: dict + + attachments: dict + related: dict + finalizing: bool = False + + +# ============================================================================ +class CrawlSpec(BaseModel): + """spec from k8s CrawlJob object""" + + id: str + cid: UUID + oid: UUID + scale: int = 1 + storage: StorageRef + started: str + crawler_channel: str + stopping: bool = False + scheduled: bool = False + timeout: int = 0 + max_crawl_size: int = 0 + qa_source_crawl_id: Optional[str] = "" + + @property + def db_crawl_id(self) -> str: + """return actual crawl_id for db, if qa run""" + return self.qa_source_crawl_id or self.id + + @property + def is_qa(self) -> bool: + """return true if qa run""" + return bool(self.qa_source_crawl_id) + + +# ============================================================================ +class PodResourcePercentage(BaseModel): + """Resource usage percentage ratios""" + + memory: float = 0 + cpu: float = 0 + storage: float = 0 + + +# ============================================================================ +class PodResources(BaseModel): + """Pod Resources""" + + memory: int = 0 + cpu: float = 0 + storage: int = 0 + + def __init__(self, *a, **kw): + if "memory" in kw: + kw["memory"] = int(parse_quantity(kw["memory"])) + if "cpu" in kw: + kw["cpu"] = 
float(parse_quantity(kw["cpu"])) + if "storage" in kw: + kw["storage"] = int(parse_quantity(kw["storage"])) + super().__init__(*a, **kw) + + +# ============================================================================ +class PodInfo(BaseModel): + """Aggregate pod status info held in CrawlJob""" + + exitTime: Optional[str] = None + exitCode: Optional[int] = None + isNewExit: Optional[bool] = Field(default=None, exclude=True) + reason: Optional[str] = None + + allocated: PodResources = PodResources() + used: PodResources = PodResources() + + newCpu: Optional[int] = None + newMemory: Optional[int] = None + + def dict(self, *a, **kw): + res = super().dict(*a, **kw) + percent = { + "memory": self.get_percent_memory(), + "cpu": self.get_percent_cpu(), + "storage": self.get_percent_storage(), + } + res["percent"] = percent + return res + + def get_percent_memory(self) -> float: + """compute percent memory used""" + return ( + float(self.used.memory) / float(self.allocated.memory) + if self.allocated.memory + else 0 + ) + + def get_percent_cpu(self) -> float: + """compute percent cpu used""" + return ( + float(self.used.cpu) / float(self.allocated.cpu) + if self.allocated.cpu + else 0 + ) + + def get_percent_storage(self) -> float: + """compute percent storage used""" + return ( + float(self.used.storage) / float(self.allocated.storage) + if self.allocated.storage + else 0 + ) + + def should_restart_pod(self): + """return true if pod should be restarted""" + if self.newMemory and self.newMemory != self.allocated.memory: + return True + + if self.newCpu and self.newCpu != self.allocated.cpu: + return True + + return False + + +# ============================================================================ +# pylint: disable=invalid-name +class CrawlStatus(BaseModel): + """status from k8s CrawlJob object""" + + state: str = "starting" + pagesFound: int = 0 + pagesDone: int = 0 + size: int = 0 + # human readable size string + sizeHuman: str = "" + scale: int = 1 + filesAdded: int = 0 + filesAddedSize: int = 0 + finished: Optional[str] = None + stopping: bool = False + stopReason: Optional[str] = None + initRedis: bool = False + crawlerImage: Optional[str] = None + lastActiveTime: str = "" + podStatus: Optional[DefaultDict[str, PodInfo]] = defaultdict( + lambda: PodInfo() # pylint: disable=unnecessary-lambda + ) + # placeholder for pydantic 2.0 -- will require this version + # podStatus: Optional[ + # DefaultDict[str, Annotated[PodInfo, Field(default_factory=PodInfo)]] + # ] + restartTime: Optional[str] + canceled: bool = False + + # updated on pod exits and at regular interval + # Crawl Execution Time -- time all crawler pods have been running + # used to track resource usage and enforce execution minutes limit + crawlExecTime: int = 0 + + # Elapsed Exec Time -- time crawl has been running in at least one pod + # used for crawl timeouts + elapsedCrawlTime: int = 0 + + # last exec time update + lastUpdatedTime: str = "" + + # any pods exited + anyCrawlPodNewExit: Optional[bool] = Field(default=False, exclude=True) + + # don't include in status, use by metacontroller + resync_after: Optional[int] = Field(default=None, exclude=True) diff --git a/backend/btrixcloud/operator/profiles.py b/backend/btrixcloud/operator/profiles.py new file mode 100644 index 0000000000..713252d7c5 --- /dev/null +++ b/backend/btrixcloud/operator/profiles.py @@ -0,0 +1,57 @@ +""" Operator handler for ProfileJobs """ + +from btrixcloud.utils import ( + from_k8s_date, + dt_now, +) + +from btrixcloud.models import StorageRef + 
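# ----------------------------------------------------------------------------
# Illustrative sketch (editor's example, not part of the upstream change): the
# PodInfo and PodResources models defined in operator/models.py above.
# PodResources parses k8s quantity strings on construction; PodInfo derives
# usage ratios and the restart decision from allocated vs. requested values.
# Quantities are example values only.
from btrixcloud.operator.models import PodInfo, PodResources

pod = PodInfo(
    allocated=PodResources(memory="2Gi", cpu="1", storage="10Gi"),
    used=PodResources(memory="1Gi", cpu="250m", storage="1Gi"),
)

assert pod.get_percent_memory() == 0.5  # 1Gi used of 2Gi allocated
assert pod.get_percent_cpu() == 0.25    # 250m used of 1 core allocated
# pod.dict() additionally exposes these ratios under a "percent" key

# a pending memory value (set elsewhere when resources are resized) triggers
# a restart on the next sync
pod.newMemory = 3 * 1024 * 1024 * 1024
assert pod.should_restart_pod()
# ----------------------------------------------------------------------------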
+from .models import MCSyncData +from .baseoperator import BaseOperator + + +# ============================================================================ +class ProfileOperator(BaseOperator): + """ProfileOperator""" + + def init_routes(self, app): + """init routes for this operator""" + + @app.post("/op/profilebrowsers/sync") + async def mc_sync_profile_browsers(data: MCSyncData): + return await self.sync_profile_browsers(data) + + async def sync_profile_browsers(self, data: MCSyncData): + """sync profile browsers""" + spec = data.parent.get("spec", {}) + + expire_time = from_k8s_date(spec.get("expireTime")) + browserid = spec.get("id") + + if dt_now() >= expire_time: + self.run_task(self.k8s.delete_profile_browser(browserid)) + return {"status": {}, "children": []} + + params = {} + params.update(self.k8s.shared_params) + params["id"] = browserid + params["userid"] = spec.get("userid", "") + + oid = spec.get("oid") + storage = StorageRef(spec.get("storageName")) + + storage_path = storage.get_storage_extra_path(oid) + storage_secret = storage.get_storage_secret_name(oid) + + params["storage_path"] = storage_path + params["storage_secret"] = storage_secret + params["profile_filename"] = spec.get("profileFilename", "") + params["crawler_image"] = spec["crawlerImage"] + + params["url"] = spec.get("startUrl", "about:blank") + params["vnc_password"] = spec.get("vncPassword") + + children = self.load_from_yaml("profilebrowser.yaml", params) + + return {"status": {}, "children": children} diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 01a31551ee..a90cafcba0 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -1,6 +1,7 @@ """ Organization API handling """ + # pylint: disable=too-many-lines import math @@ -21,6 +22,7 @@ SUCCESSFUL_STATES, RUNNING_STATES, STARTING_STATES, + BaseCrawl, Organization, StorageRef, OrgQuotas, @@ -520,18 +522,22 @@ async def set_origin(self, org: Organization, request: Request): {"_id": org.id}, {"$set": {"origin": origin}} ) - async def inc_org_time_stats(self, oid, duration, is_exec_time=False): + async def inc_org_time_stats(self, oid, duration, is_exec_time=False, is_qa=False): """inc crawl duration stats for org Overage is applied only to crawlExecSeconds - monthlyExecSeconds, giftedExecSeconds, and extraExecSeconds are added to only up to quotas + + If is_qa is true, also update seperate qa only counter """ - # pylint: disable=too-many-return-statements + # pylint: disable=too-many-return-statements, too-many-locals key = "crawlExecSeconds" if is_exec_time else "usage" yymm = datetime.utcnow().strftime("%Y-%m") - await self.orgs.find_one_and_update( - {"_id": oid}, {"$inc": {f"{key}.{yymm}": duration}} - ) + inc_query = {f"{key}.{yymm}": duration} + if is_qa: + qa_key = "qaCrawlExecSeconds" if is_exec_time else "qaUsage" + inc_query[f"{qa_key}.{yymm}"] = duration + await self.orgs.find_one_and_update({"_id": oid}, {"$inc": inc_query}) if not is_exec_time: return @@ -630,17 +636,17 @@ async def get_org_metrics(self, org: Organization): upload_count = 0 page_count = 0 - async for item in self.crawls_db.find({"oid": org.id}): - if item["state"] not in SUCCESSFUL_STATES: + async for item_data in self.crawls_db.find({"oid": org.id}): + item = BaseCrawl.from_dict(item_data) + if item.state not in SUCCESSFUL_STATES: continue archived_item_count += 1 - type_ = item.get("type") - if type_ == "crawl": + if item.type == "crawl": crawl_count += 1 - if type_ == "upload": + if item.type == "upload": upload_count += 1 - if 
item.get("stats"): - page_count += item.get("stats", {}).get("done", 0) + if item.stats: + page_count += item.stats.done profile_count = await self.profiles_db.count_documents({"oid": org.id}) workflows_running_count = await self.crawls_db.count_documents( diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py new file mode 100644 index 0000000000..231ff9cb58 --- /dev/null +++ b/backend/btrixcloud/pages.py @@ -0,0 +1,631 @@ +"""crawl pages""" + +import asyncio +import traceback +from datetime import datetime +from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union +from uuid import UUID, uuid4 + +from fastapi import Depends, HTTPException +import pymongo + +from .models import ( + Page, + PageOut, + PageOutWithSingleQA, + PageReviewUpdate, + PageQACompare, + Organization, + PaginatedResponse, + User, + PageNote, + PageNoteIn, + PageNoteEdit, + PageNoteDelete, +) +from .pagination import DEFAULT_PAGE_SIZE, paginated_format +from .utils import from_k8s_date + +if TYPE_CHECKING: + from .crawls import CrawlOps + from .orgs import OrgOps + from .storages import StorageOps +else: + CrawlOps = StorageOps = OrgOps = object + + +# ============================================================================ +# pylint: disable=too-many-instance-attributes, too-many-arguments +class PageOps: + """crawl pages""" + + crawl_ops: CrawlOps + org_ops: OrgOps + storage_ops: StorageOps + + def __init__(self, mdb, crawl_ops, org_ops, storage_ops): + self.pages = mdb["pages"] + self.crawls = mdb["crawls"] + self.crawl_ops = crawl_ops + self.org_ops = org_ops + self.storage_ops = storage_ops + + async def init_index(self): + """init index for pages db collection""" + await self.pages.create_index([("crawl_id", pymongo.HASHED)]) + + async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): + """Add pages to database from WACZ files""" + pages_buffer: List[Page] = [] + try: + crawl = await self.crawl_ops.get_crawl_out(crawl_id) + stream = await self.storage_ops.sync_stream_wacz_pages( + crawl.resources or [] + ) + for page_dict in stream: + if not page_dict.get("url"): + continue + + if len(pages_buffer) > batch_size: + await self._add_pages_to_db(pages_buffer) + + pages_buffer.append( + self._get_page_from_dict(page_dict, crawl_id, crawl.oid) + ) + + # Add any remaining pages in buffer to db + if pages_buffer: + await self._add_pages_to_db(pages_buffer) + + print(f"Added pages for crawl {crawl_id} to db", flush=True) + # pylint: disable=broad-exception-caught, raise-missing-from + except Exception as err: + traceback.print_exc() + print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True) + + def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID): + """Return Page object from dict""" + page_id = page_dict.get("id") + if not page_id: + print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True) + + status = page_dict.get("status") + if not status and page_dict.get("loadState"): + status = 200 + + return Page( + id=page_id, + oid=oid, + crawl_id=crawl_id, + url=page_dict.get("url"), + title=page_dict.get("title"), + loadState=page_dict.get("loadState"), + status=status, + ts=( + from_k8s_date(page_dict.get("ts")) + if page_dict.get("ts") + else datetime.now() + ), + ) + + async def _add_pages_to_db(self, pages: List[Page]): + """Add batch of pages to db in one insert""" + result = await self.pages.insert_many( + [ + page.to_dict( + exclude_unset=True, exclude_none=True, exclude_defaults=True + ) + 
for page in pages + ] + ) + if not result.inserted_ids: + # pylint: disable=broad-exception-raised + raise Exception("No pages inserted") + + async def add_page_to_db( + self, + page_dict: Dict[str, Any], + crawl_id: str, + qa_run_id: Optional[str], + oid: UUID, + ): + """Add page to database""" + page = self._get_page_from_dict(page_dict, crawl_id, oid) + + try: + await self.pages.insert_one( + page.to_dict( + exclude_unset=True, exclude_none=True, exclude_defaults=True + ) + ) + except pymongo.errors.DuplicateKeyError: + pass + + # pylint: disable=broad-except + except Exception as err: + print( + f"Error adding page {page.id} from crawl {crawl_id} to db: {err}", + flush=True, + ) + return + + # qa data + if qa_run_id and page: + compare_dict = page_dict.get("comparison") + if compare_dict is None: + print("QA Run, but compare data missing!") + return + + compare = PageQACompare(**compare_dict) + print("Adding QA Run Data for Page", page_dict.get("url"), compare) + + await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare) + + async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None): + """Delete crawl pages from db""" + query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id} + if oid: + query["oid"] = oid + try: + await self.pages.delete_many(query) + # pylint: disable=broad-except + except Exception as err: + print( + f"Error deleting pages from crawl {crawl_id}: {err}", + flush=True, + ) + + async def get_page_raw( + self, + page_id: UUID, + oid: UUID, + crawl_id: Optional[str] = None, + ) -> Dict[str, Any]: + """Return page dict by id""" + query: Dict[str, Union[str, UUID]] = {"_id": page_id, "oid": oid} + if crawl_id: + query["crawl_id"] = crawl_id + + page = await self.pages.find_one(query) + if not page: + raise HTTPException(status_code=404, detail="page_not_found") + return page + + async def get_page( + self, + page_id: UUID, + oid: UUID, + crawl_id: Optional[str] = None, + ) -> Page: + """Return Page object by id""" + page_raw = await self.get_page_raw(page_id, oid, crawl_id) + return Page.from_dict(page_raw) + + async def add_qa_run_for_page( + self, page_id: UUID, oid: UUID, qa_run_id: str, compare: PageQACompare + ) -> bool: + """Update page heuristics and mime/type from QA run""" + + # modified = datetime.utcnow().replace(microsecond=0, tzinfo=None) + + result = await self.pages.find_one_and_update( + {"_id": page_id, "oid": oid}, + {"$set": {f"qa.{qa_run_id}": compare.dict()}}, + return_document=pymongo.ReturnDocument.AFTER, + ) + + if not result: + raise HTTPException(status_code=404, detail="page_not_found") + + return True + + async def delete_qa_run_from_pages(self, crawl_id: str, qa_run_id: str): + """delete pages""" + result = await self.pages.update_many( + {"crawl_id": crawl_id}, {"$unset": {f"qa.{qa_run_id}": ""}} + ) + return result + + async def update_page_approval( + self, + page_id: UUID, + oid: UUID, + approved: Optional[bool] = None, + crawl_id: Optional[str] = None, + user: Optional[User] = None, + ) -> Dict[str, bool]: + """Update page manual review""" + query: Dict[str, Union[Optional[bool], str, datetime, UUID]] = { + "approved": approved + } + query["modified"] = datetime.utcnow().replace(microsecond=0, tzinfo=None) + if user: + query["userid"] = user.id + + result = await self.pages.find_one_and_update( + {"_id": page_id, "oid": oid, "crawl_id": crawl_id}, + {"$set": query}, + return_document=pymongo.ReturnDocument.AFTER, + ) + + if not result: + raise HTTPException(status_code=404, detail="page_not_found") + + return 
{"updated": True} + + async def add_page_note( + self, + page_id: UUID, + oid: UUID, + text: str, + user: User, + crawl_id: str, + ) -> Dict[str, bool]: + """Add note to page""" + note = PageNote(id=uuid4(), text=text, userid=user.id, userName=user.name) + + modified = datetime.utcnow().replace(microsecond=0, tzinfo=None) + + result = await self.pages.find_one_and_update( + {"_id": page_id, "oid": oid, "crawl_id": crawl_id}, + { + "$push": {"notes": note.dict()}, + "$set": {"modified": modified}, + }, + return_document=pymongo.ReturnDocument.AFTER, + ) + + if not result: + raise HTTPException(status_code=404, detail="page_not_found") + + return {"added": True} + + async def update_page_note( + self, + page_id: UUID, + oid: UUID, + note_in: PageNoteEdit, + user: User, + crawl_id: str, + ) -> Dict[str, bool]: + """Update specific page note""" + page = await self.get_page_raw(page_id, oid) + page_notes = page.get("notes", []) + + try: + matching_index = [ + index + for index, note in enumerate(page_notes) + if note["id"] == note_in.id + ][0] + + except IndexError: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=404, detail="page_note_not_found") + + new_note = PageNote( + id=note_in.id, text=note_in.text, userid=user.id, userName=user.name + ) + page_notes[matching_index] = new_note.dict() + + modified = datetime.utcnow().replace(microsecond=0, tzinfo=None) + + result = await self.pages.find_one_and_update( + {"_id": page_id, "oid": oid, "crawl_id": crawl_id}, + {"$set": {"notes": page_notes, "modified": modified}}, + return_document=pymongo.ReturnDocument.AFTER, + ) + + if not result: + raise HTTPException(status_code=404, detail="page_not_found") + + return {"updated": True} + + async def delete_page_notes( + self, + page_id: UUID, + oid: UUID, + delete: PageNoteDelete, + crawl_id: str, + ) -> Dict[str, bool]: + """Delete specific page notes""" + page = await self.get_page_raw(page_id, oid) + page_notes = page.get("notes", []) + + remaining_notes = [] + for note in page_notes: + if not note.get("id") in delete.delete_list: + remaining_notes.append(note) + + modified = datetime.utcnow().replace(microsecond=0, tzinfo=None) + + result = await self.pages.find_one_and_update( + {"_id": page_id, "oid": oid, "crawl_id": crawl_id}, + {"$set": {"notes": remaining_notes, "modified": modified}}, + return_document=pymongo.ReturnDocument.AFTER, + ) + + if not result: + raise HTTPException(status_code=404, detail="page_not_found") + + return {"deleted": True} + + async def list_pages( + self, + crawl_id: str, + org: Optional[Organization] = None, + qa_run_id: Optional[str] = None, + qa_filter_by: Optional[str] = None, + qa_gte: Optional[float] = None, + qa_gt: Optional[float] = None, + qa_lte: Optional[float] = None, + qa_lt: Optional[float] = None, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sort_by: Optional[str] = None, + sort_direction: Optional[int] = -1, + ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]: + """List all pages in crawl""" + # pylint: disable=duplicate-code, too-many-locals, too-many-branches + # Zero-index page for query + page = page - 1 + skip = page_size * page + + query: dict[str, object] = { + "crawl_id": crawl_id, + } + if org: + query["oid"] = org.id + + if qa_run_id: + query[f"qa.{qa_run_id}"] = {"$exists": True} + + range_filter = {} + + if qa_gte: + range_filter["$gte"] = qa_gte + if qa_lte: + range_filter["$lte"] = qa_lte + if qa_gt: + range_filter["$gt"] = qa_gt + if qa_lt: + range_filter["$lt"] = qa_lt + + if 
qa_filter_by: + if not range_filter: + raise HTTPException(status_code=400, detail="range_missing") + + query[f"qa.{qa_run_id}.{qa_filter_by}"] = range_filter + + aggregate = [{"$match": query}] + + if sort_by: + # Sorting options to add: + # - automated heuristics like screenshot_comparison (dict keyed by QA run id) + # - Ensure notes sorting works okay with notes in list + sort_fields = ("url", "title", "notes", "approved") + qa_sort_fields = ("screenshotMatch", "textMatch") + if sort_by not in sort_fields and sort_by not in qa_sort_fields: + raise HTTPException(status_code=400, detail="invalid_sort_by") + if sort_direction not in (1, -1): + raise HTTPException(status_code=400, detail="invalid_sort_direction") + + if sort_by in qa_sort_fields: + if not qa_run_id: + raise HTTPException( + status_code=400, detail="qa_run_id_missing_for_qa_sort" + ) + + sort_by = f"qa.{qa_run_id}.{sort_by}" + + aggregate.extend([{"$sort": {sort_by: sort_direction}}]) + + if qa_run_id: + aggregate.extend([{"$set": {"qa": f"$qa.{qa_run_id}"}}]) + # aggregate.extend([{"$project": {"qa": f"$qa.{qa_run_id}"}}]) + + aggregate.extend( + [ + { + "$facet": { + "items": [ + {"$skip": skip}, + {"$limit": page_size}, + ], + "total": [{"$count": "count"}], + } + }, + ] + ) + + # Get total + cursor = self.pages.aggregate(aggregate) + results = await cursor.to_list(length=1) + result = results[0] + items = result["items"] + + try: + total = int(result["total"][0]["count"]) + except (IndexError, ValueError): + total = 0 + + if qa_run_id: + return [PageOutWithSingleQA.from_dict(data) for data in items], total + + return [PageOut.from_dict(data) for data in items], total + + async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): + """Delete existing pages for crawl and re-add from WACZs.""" + await self.delete_crawl_pages(crawl_id, oid) + print(f"Deleted pages for crawl {crawl_id}", flush=True) + await self.add_crawl_pages_to_db_from_wacz(crawl_id) + + async def re_add_all_crawl_pages(self, oid: UUID): + """Re-add pages for all crawls in org""" + crawl_ids = await self.crawls.distinct( + "_id", {"type": "crawl", "finished": {"$ne": None}} + ) + for crawl_id in crawl_ids: + await self.re_add_crawl_pages(crawl_id, oid) + + +# ============================================================================ +# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme +def init_pages_api(app, mdb, crawl_ops, org_ops, storage_ops, user_dep): + """init pages API""" + # pylint: disable=invalid-name + + ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) + + org_crawl_dep = org_ops.org_crawl_dep + + @app.post("/orgs/{oid}/crawls/all/pages/reAdd", tags=["pages"]) + async def re_add_all_crawl_pages( + org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep) + ): + """Re-add pages for all crawls in org (superuser only)""" + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + + asyncio.create_task(ops.re_add_all_crawl_pages(org.id)) + return {"started": True} + + @app.post("/orgs/{oid}/crawls/{crawl_id}/pages/reAdd", tags=["pages"]) + async def re_add_crawl_pages( + crawl_id: str, org: Organization = Depends(org_crawl_dep) + ): + """Re-add pages for crawl""" + asyncio.create_task(ops.re_add_crawl_pages(crawl_id, org.id)) + return {"started": True} + + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}", + tags=["pages"], + response_model=Page, + ) + async def get_page( + crawl_id: str, + page_id: UUID, + org: Organization = Depends(org_crawl_dep), + ): + 
"""GET single page""" + return await ops.get_page(page_id, org.id, crawl_id) + + @app.patch( + "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}", + tags=["pages"], + ) + async def update_page_approval( + crawl_id: str, + page_id: UUID, + update: PageReviewUpdate, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), + ): + """Update review for specific page""" + return await ops.update_page_approval( + page_id, org.id, update.approved, crawl_id, user + ) + + @app.post( + "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}/notes", + tags=["pages"], + ) + async def add_page_note( + crawl_id: str, + page_id: UUID, + note: PageNoteIn, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), + ): + """Add note to page""" + return await ops.add_page_note(page_id, org.id, note.text, user, crawl_id) + + @app.patch( + "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}/notes", + tags=["pages"], + ) + async def edit_page_note( + crawl_id: str, + page_id: UUID, + note: PageNoteEdit, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), + ): + """Edit page note""" + return await ops.update_page_note(page_id, org.id, note, user, crawl_id) + + @app.post( + "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}/notes/delete", + tags=["pages"], + ) + async def delete_page_notes( + crawl_id: str, + page_id: UUID, + delete: PageNoteDelete, + org: Organization = Depends(org_crawl_dep), + ): + """Edit page note""" + return await ops.delete_page_notes(page_id, org.id, delete, crawl_id) + + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/pages", + tags=["pages"], + response_model=PaginatedResponse, + ) + async def get_pages_list( + crawl_id: str, + org: Organization = Depends(org_crawl_dep), + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sortBy: Optional[str] = None, + sortDirection: Optional[int] = -1, + ): + """Retrieve paginated list of pages""" + pages, total = await ops.list_pages( + crawl_id=crawl_id, + org=org, + page_size=pageSize, + page=page, + sort_by=sortBy, + sort_direction=sortDirection, + ) + return paginated_format(pages, total, page, pageSize) + + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages", + tags=["pages", "qa"], + response_model=PaginatedResponse, + ) + async def get_pages_list_with_qa( + crawl_id: str, + qa_run_id: str, + filterQABy: Optional[str] = None, + gte: Optional[float] = None, + gt: Optional[float] = None, + lte: Optional[float] = None, + lt: Optional[float] = None, + org: Organization = Depends(org_crawl_dep), + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sortBy: Optional[str] = None, + sortDirection: Optional[int] = -1, + ): + """Retrieve paginated list of pages""" + pages, total = await ops.list_pages( + crawl_id=crawl_id, + org=org, + qa_run_id=qa_run_id, + qa_filter_by=filterQABy, + qa_gte=gte, + qa_gt=gt, + qa_lte=lte, + qa_lt=lt, + page_size=pageSize, + page=page, + sort_by=sortBy, + sort_direction=sortDirection, + ) + return paginated_format(pages, total, page, pageSize) + + return ops diff --git a/backend/btrixcloud/pagination.py b/backend/btrixcloud/pagination.py index 4823613368..9b9e727060 100644 --- a/backend/btrixcloud/pagination.py +++ b/backend/btrixcloud/pagination.py @@ -1,4 +1,5 @@ """API pagination""" + from typing import Any, List, Optional diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index a15406940a..835aadffee 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -1,6 +1,7 @@ """ Storage API """ + from typing 
import ( Optional, Iterator, @@ -9,21 +10,24 @@ Dict, AsyncIterator, TYPE_CHECKING, + Any, ) from urllib.parse import urlsplit -from contextlib import asynccontextmanager, contextmanager +from contextlib import asynccontextmanager +from itertools import chain import asyncio import heapq import zlib import json -import itertools import os from datetime import datetime +from zipfile import ZipInfo from fastapi import Depends, HTTPException from stream_zip import stream_zip, NO_COMPRESSION_64 +from remotezip import RemoteZip import aiobotocore.session import boto3 @@ -41,10 +45,6 @@ S3StorageIn, OrgStorageRefs, ) -from .zip import ( - sync_get_zip_file, - sync_get_log_stream, -) from .utils import is_bool, slug_from_name @@ -72,12 +72,21 @@ class StorageOps: org_ops: OrgOps crawl_manager: CrawlManager + is_local_minio: bool + frontend_origin: str + def __init__(self, org_ops, crawl_manager) -> None: self.org_ops = org_ops self.crawl_manager = crawl_manager self.is_local_minio = is_bool(os.environ.get("IS_LOCAL_MINIO")) + frontend_origin = os.environ.get( + "FRONTEND_ORIGIN", "http://browsertrix-cloud-frontend" + ) + default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default") + self.frontend_origin = f"{frontend_origin}.{default_namespace}" + with open(os.environ["STORAGES_JSON"], encoding="utf-8") as fh: storage_list = json.loads(fh.read()) @@ -275,8 +284,10 @@ async def get_s3_client( ) as client: yield client, bucket, key - @contextmanager - def get_sync_client(self, org: Organization) -> Iterator[tuple[S3Client, str, str]]: + @asynccontextmanager + async def get_sync_client( + self, org: Organization + ) -> AsyncIterator[tuple[S3Client, str, str]]: """context manager for s3 client""" storage = self.get_org_primary_storage(org) @@ -312,6 +323,12 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None resp = await client.put_object(Bucket=bucket, Key=key, Body=data) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + def resolve_internal_access_path(self, path): + """Resolve relative path for internal access to minio bucket""" + if path.startswith("/"): + return self.frontend_origin + path + return path + def get_org_relative_path( self, org: Organization, ref: StorageRef, file_path: str ) -> str: @@ -497,7 +514,7 @@ async def _delete_file( s3storage = self.get_org_storage_by_ref(org, storage) - async with self.get_s3_client(s3storage, s3storage.use_access_for_presign) as ( + async with self.get_s3_client(s3storage) as ( client, bucket, key, @@ -508,55 +525,53 @@ async def _delete_file( return status_code == 204 + async def sync_stream_wacz_pages( + self, wacz_files: List[CrawlFileOut] + ) -> Iterator[Dict[Any, Any]]: + """Return stream of pages specified WACZ""" + loop = asyncio.get_event_loop() + + resp = await loop.run_in_executor(None, self._sync_get_pages, wacz_files) + + return resp + async def sync_stream_wacz_logs( self, - org: Organization, - wacz_files: List[CrawlFile], + wacz_files: List[CrawlFileOut], log_levels: List[str], contexts: List[str], ) -> Iterator[bytes]: """Return filtered stream of logs from specified WACZs sorted by timestamp""" - with self.get_sync_client(org) as (client, bucket, key): - loop = asyncio.get_event_loop() - - resp = await loop.run_in_executor( - None, - self._sync_get_logs, - wacz_files, - log_levels, - contexts, - client, - bucket, - key, - ) + loop = asyncio.get_event_loop() + + resp = await loop.run_in_executor( + None, + self._sync_get_logs, + wacz_files, + log_levels, + contexts, + ) - return resp + 
return resp def _sync_get_logs( self, - wacz_files: List[CrawlFile], + wacz_files: List[CrawlFileOut], log_levels: List[str], contexts: List[str], - client, - bucket: str, - key: str, ) -> Iterator[bytes]: """Generate filtered stream of logs from specified WACZs sorted by timestamp""" # pylint: disable=too-many-function-args def stream_log_lines( - wacz_key, wacz_filename, cd_start, log_zipinfo + log_zipinfo: ZipInfo, wacz_url: str, wacz_filename: str ) -> Iterator[dict]: """Pass lines as json objects""" + filename = log_zipinfo.filename - print( - f"Fetching log {log_zipinfo.filename} from {wacz_filename}", flush=True - ) - - line_iter: Iterator[bytes] = sync_get_log_stream( - client, bucket, wacz_key, log_zipinfo, cd_start - ) + print(f"Fetching log {filename} from {wacz_filename}", flush=True) + line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename) for line in line_iter: yield _parse_json(line.decode("utf-8", errors="ignore")) @@ -573,14 +588,14 @@ def stream_json_lines( yield json_str.encode("utf-8") def organize_based_on_instance_number( - wacz_files: List[CrawlFile], - ) -> List[List[CrawlFile]]: + wacz_files: List[CrawlFileOut], + ) -> List[List[CrawlFileOut]]: """Place wacz_files into their own list based on instance number""" - wacz_files.sort(key=lambda file: file.filename) - waczs_groups: Dict[str, List[CrawlFile]] = {} + wacz_files.sort(key=lambda file: file.name) + waczs_groups: Dict[str, List[CrawlFileOut]] = {} for file in wacz_files: - instance_number = file.filename[ - file.filename.rfind("-") + 1 : file.filename.rfind(".") + instance_number = file.name[ + file.name.rfind("-") + 1 : file.name.rfind(".") ] if instance_number in waczs_groups: waczs_groups[instance_number].append(file) @@ -595,29 +610,73 @@ def organize_based_on_instance_number( wacz_log_streams: List[Iterator[dict]] = [] for wacz_file in instance_list: - wacz_key = key + wacz_file.filename - cd_start, zip_file = sync_get_zip_file(client, bucket, wacz_key) - - log_files = [ - f - for f in zip_file.filelist - if f.filename.startswith("logs/") and not f.is_dir() - ] - log_files.sort(key=lambda log_zipinfo: log_zipinfo.filename) - - for log_zipinfo in log_files: - wacz_log_streams.append( - stream_log_lines( - wacz_key, wacz_file.filename, cd_start, log_zipinfo + wacz_url = self.resolve_internal_access_path(wacz_file.path) + with RemoteZip(wacz_url) as remote_zip: + log_files: List[ZipInfo] = [ + f + for f in remote_zip.infolist() + if f.filename.startswith("logs/") and not f.is_dir() + ] + log_files.sort(key=lambda log_zipinfo: log_zipinfo.filename) + + for log_zipinfo in log_files: + wacz_log_streams.append( + stream_log_lines(log_zipinfo, wacz_url, wacz_file.name) ) - ) - log_generators.append(itertools.chain(*wacz_log_streams)) + log_generators.append(chain(*wacz_log_streams)) heap_iter = heapq.merge(*log_generators, key=lambda entry: entry["timestamp"]) return stream_json_lines(heap_iter, log_levels, contexts) + def _sync_get_pages( + self, + wacz_files: List[CrawlFileOut], + ) -> Iterator[Dict[Any, Any]]: + """Generate stream of page dicts from specified WACZs""" + + # pylint: disable=too-many-function-args + def stream_page_lines( + pagefile_zipinfo: ZipInfo, wacz_url: str, wacz_filename: str + ) -> Iterator[Dict[Any, Any]]: + """Pass lines as json objects""" + filename = pagefile_zipinfo.filename + + print( + f"Fetching JSON lines from {filename} in {wacz_filename}", + flush=True, + ) + + line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename) + for line in 
line_iter: + yield _parse_json(line.decode("utf-8", errors="ignore")) + + page_generators: List[Iterator[Dict[Any, Any]]] = [] + + for wacz_file in wacz_files: + wacz_url = self.resolve_internal_access_path(wacz_file.path) + with RemoteZip(wacz_url) as remote_zip: + page_files: List[ZipInfo] = [ + f + for f in remote_zip.infolist() + if f.filename.startswith("pages/") + and f.filename.endswith(".jsonl") + and not f.is_dir() + ] + for pagefile_zipinfo in page_files: + page_generators.append( + stream_page_lines(pagefile_zipinfo, wacz_url, wacz_file.name) + ) + + return chain.from_iterable(page_generators) + + def _sync_get_filestream(self, wacz_url: str, filename: str) -> Iterator[bytes]: + """Return iterator of lines in remote file as bytes""" + with RemoteZip(wacz_url) as remote_zip: + with remote_zip.open(filename) as file_stream: + yield from file_stream + def _sync_dl( self, all_files: List[CrawlFileOut], client: S3Client, bucket: str, key: str ) -> Iterator[bytes]: @@ -664,7 +723,7 @@ async def download_streaming_wacz( ) -> Iterator[bytes]: """return an iter for downloading a stream nested wacz file from list of files""" - with self.get_sync_client(org) as (client, bucket, key): + async with self.get_sync_client(org) as (client, bucket, key): loop = asyncio.get_event_loop() resp = await loop.run_in_executor( diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 6f8f6474a0..2b3f6e2023 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -39,6 +39,15 @@ class UploadOps(BaseCrawlOps): """upload ops""" + async def get_upload( + self, + crawlid: str, + org: Optional[Organization] = None, + ) -> UploadedCrawl: + """Get crawl data for internal use""" + res = await self.get_crawl_raw(crawlid, org, "upload") + return UploadedCrawl.from_dict(res) + # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-function-args # pylint: disable=too-many-arguments, too-many-locals, duplicate-code, invalid-name async def upload_stream( @@ -60,7 +69,7 @@ async def upload_stream( prev_upload = None if replaceId: try: - prev_upload = await self.get_crawl_raw(replaceId, org, "upload") + prev_upload = await self.get_upload(replaceId, org) except HTTPException: # not found replaceId = None @@ -371,7 +380,7 @@ async def list_uploads( response_model=CrawlOut, ) async def get_upload(crawlid: str, org: Organization = Depends(org_crawl_dep)): - return await ops.get_crawl(crawlid, org, "upload") + return await ops.get_crawl_out(crawlid, org, "upload") @app.get( "/orgs/all/uploads/{crawl_id}/replay.json", @@ -382,7 +391,7 @@ async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await ops.get_crawl(crawl_id, None, "upload") + return await ops.get_crawl_out(crawl_id, None, "upload") @app.get( "/orgs/{oid}/uploads/{crawl_id}/replay.json", @@ -390,7 +399,7 @@ async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)): response_model=CrawlOutWithResources, ) async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)): - return await ops.get_crawl(crawl_id, org, "upload") + return await ops.get_crawl_out(crawl_id, org, "upload") @app.patch("/orgs/{oid}/uploads/{crawl_id}", tags=["uploads"]) async def update_uploads_api( diff --git a/backend/btrixcloud/version.py b/backend/btrixcloud/version.py index 158676b2b8..5375dd619a 100644 --- a/backend/btrixcloud/version.py +++ 
b/backend/btrixcloud/version.py @@ -1,2 +1,3 @@ """ current version """ -__version__ = "1.9.0-beta.2" + +__version__ = "1.10.0-beta.0" diff --git a/backend/btrixcloud/webhooks.py b/backend/btrixcloud/webhooks.py index aafe10e80c..5b3ec99243 100644 --- a/backend/btrixcloud/webhooks.py +++ b/backend/btrixcloud/webhooks.py @@ -195,12 +195,12 @@ async def _create_item_finished_notification( body: Union[CrawlFinishedBody, UploadFinishedBody], ): """Create webhook notification for finished crawl/upload.""" - crawl = await self.crawl_ops.get_crawl(crawl_id, org) + crawl = await self.crawl_ops.get_crawl_out(crawl_id, org) if not crawl: print(f"Crawl {crawl_id} not found, skipping event webhook", flush=True) return - body.resources = crawl.resources + body.resources = crawl.resources or [] notification = WebhookNotification( id=uuid4(), diff --git a/backend/btrixcloud/zip.py b/backend/btrixcloud/zip.py deleted file mode 100644 index e1c0d445ac..0000000000 --- a/backend/btrixcloud/zip.py +++ /dev/null @@ -1,198 +0,0 @@ -""" -Methods for interacting with zip/WACZ files -""" -import io -import struct -import zipfile -import zlib - - -# ============================================================================ -EOCD_RECORD_SIZE = 22 -ZIP64_EOCD_RECORD_SIZE = 56 -ZIP64_EOCD_LOCATOR_SIZE = 20 - -MAX_STANDARD_ZIP_SIZE = 4_294_967_295 - -CHUNK_SIZE = 1024 * 256 - - -# ============================================================================ -def sync_get_log_stream(client, bucket, key, log_zipinfo, cd_start): - """Return uncompressed byte stream of log file in WACZ""" - # pylint: disable=too-many-locals - file_head = sync_fetch( - client, bucket, key, cd_start + log_zipinfo.header_offset + 26, 4 - ) - name_len = parse_little_endian_to_int(file_head[0:2]) - extra_len = parse_little_endian_to_int(file_head[2:4]) - - content = sync_fetch_stream( - client, - bucket, - key, - cd_start + log_zipinfo.header_offset + 30 + name_len + extra_len, - log_zipinfo.compress_size, - ) - - if log_zipinfo.compress_type == zipfile.ZIP_DEFLATED: - uncompressed_content = zlib.decompressobj(-zlib.MAX_WBITS).decompress(content) - else: - uncompressed_content = content - - return sync_iter_lines(uncompressed_content) - - -def sync_iter_lines(chunk_iter, keepends=True): - """ - Iter by lines, adapted from botocore - """ - pending = b"" - for chunk in chunk_iter: - lines = (pending + chunk).splitlines(True) - for line in lines[:-1]: - yield line.splitlines(keepends)[0] - pending = lines[-1] - if pending: - yield pending.splitlines(keepends)[0] - - -async def get_zip_file(client, bucket, key): - """Fetch enough of the WACZ file be able to read the zip filelist""" - file_size = await get_file_size(client, bucket, key) - eocd_record = await fetch( - client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE - ) - - if file_size <= MAX_STANDARD_ZIP_SIZE: - cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record) - central_directory = await fetch(client, bucket, key, cd_start, cd_size) - return ( - cd_start, - zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)), - ) - - zip64_eocd_record = await fetch( - client, - bucket, - key, - file_size - - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE), - ZIP64_EOCD_RECORD_SIZE, - ) - zip64_eocd_locator = await fetch( - client, - bucket, - key, - file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE), - ZIP64_EOCD_LOCATOR_SIZE, - ) - cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record) - central_directory = 
await fetch(client, bucket, key, cd_start, cd_size) - return ( - cd_start, - zipfile.ZipFile( - io.BytesIO( - central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record - ) - ), - ) - - -def sync_get_zip_file(client, bucket, key): - """Fetch enough of the WACZ file be able to read the zip filelist""" - file_size = sync_get_file_size(client, bucket, key) - eocd_record = sync_fetch( - client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE - ) - - if file_size <= MAX_STANDARD_ZIP_SIZE: - cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record) - central_directory = sync_fetch(client, bucket, key, cd_start, cd_size) - with zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)) as zip_file: - return (cd_start, zip_file) - - zip64_eocd_record = sync_fetch( - client, - bucket, - key, - file_size - - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE), - ZIP64_EOCD_RECORD_SIZE, - ) - zip64_eocd_locator = sync_fetch( - client, - bucket, - key, - file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE), - ZIP64_EOCD_LOCATOR_SIZE, - ) - cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record) - central_directory = sync_fetch(client, bucket, key, cd_start, cd_size) - with zipfile.ZipFile( - io.BytesIO( - central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record - ) - ) as zip_file: - return (cd_start, zip_file) - - -async def get_file_size(client, bucket, key): - """Get WACZ file size from HEAD request""" - head_response = await client.head_object(Bucket=bucket, Key=key) - return head_response["ContentLength"] - - -def sync_get_file_size(client, bucket, key): - """Get WACZ file size from HEAD request""" - head_response = client.head_object(Bucket=bucket, Key=key) - return head_response["ContentLength"] - - -async def fetch(client, bucket, key, start, length): - """Fetch a byte range from a file in object storage""" - end = start + length - 1 - response = await client.get_object( - Bucket=bucket, Key=key, Range=f"bytes={start}-{end}" - ) - return await response["Body"].read() - - -def sync_fetch(client, bucket, key, start, length): - """Fetch a byte range from a file in object storage""" - end = start + length - 1 - response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}") - return response["Body"].read() - - -def sync_fetch_stream(client, bucket, key, start, length): - """Fetch a byte range from a file in object storage as a stream""" - end = start + length - 1 - response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}") - return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE) - - -def get_central_directory_metadata_from_eocd(eocd): - """Get central directory start and size""" - cd_size = parse_little_endian_to_int(eocd[12:16]) - cd_start = parse_little_endian_to_int(eocd[16:20]) - return cd_start, cd_size - - -def get_central_directory_metadata_from_eocd64(eocd64): - """Get central directory start and size for zip64""" - cd_size = parse_little_endian_to_int(eocd64[40:48]) - cd_start = parse_little_endian_to_int(eocd64[48:56]) - return cd_start, cd_size - - -def parse_little_endian_to_int(little_endian_bytes): - """Convert little endian used in zip spec to int""" - byte_length = len(little_endian_bytes) - format_character = "q" - if byte_length == 4: - format_character = "i" - elif byte_length == 2: - format_character = "h" - - return struct.unpack("<" + format_character, little_endian_bytes)[0] diff --git a/backend/requirements.txt 
b/backend/requirements.txt index 0877636c95..078472546e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -9,7 +9,7 @@ email-validator #fastapi-users[mongodb]==9.2.2 loguru aiofiles -kubernetes-asyncio==25.11.0 +kubernetes-asyncio==29.0.0 kubernetes aiobotocore redis>=5.0.0 @@ -28,3 +28,4 @@ types_aiobotocore_s3 types-redis types-python-slugify types-pyYAML +remotezip diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py new file mode 100644 index 0000000000..6029541843 --- /dev/null +++ b/backend/test/test_qa.py @@ -0,0 +1,187 @@ +from .conftest import API_PREFIX, HOST_PREFIX +import requests +import time +from datetime import datetime + +qa_run_id = None + + +def test_run_qa(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start", + headers=crawler_auth_headers, + ) + + assert r.status_code == 200 + + data = r.json() + assert data["started"] + global qa_run_id + qa_run_id = data["started"] + + +def test_run_qa_already_running(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start", + headers=crawler_auth_headers, + ) + + assert r.status_code == 400 + assert r.json()["detail"] == "qa_already_running" + + +def test_active_qa(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA", + headers=crawler_auth_headers, + ) + + data = r.json() + qa = data["qa"] + + assert qa + assert qa["state"] + assert qa["started"] + assert not qa["finished"] + + +def test_qa_list(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa", + headers=crawler_auth_headers, + ) + + data = r.json() + + assert len(data) == 1 + + qa = data[0] + assert qa + assert qa["state"] + assert qa["started"] + assert not qa["finished"] + + +def test_wait_for_complete(crawler_crawl_id, crawler_auth_headers, default_org_id): + count = 0 + completed = False + while count < 24: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA", + headers=crawler_auth_headers, + ) + + data = r.json() + if not data["qa"]: + completed = True + break + + time.sleep(5) + count += 1 + + assert completed + + +def test_qa_completed(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa", + headers=crawler_auth_headers, + ) + + data = r.json() + + assert len(data) == 1 + + qa = data[0] + assert qa + assert qa["state"] == "complete" + assert qa["started"] + assert qa["finished"] + assert qa["stats"]["found"] == 1 + assert qa["stats"]["done"] == 1 + assert qa["crawlExecSeconds"] > 0 + + +def test_qa_org_stats(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}", + headers=crawler_auth_headers, + ) + crawl_stats = r.json() + assert crawl_stats["qaCrawlExecSeconds"] > 0 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}", + headers=crawler_auth_headers, + ) + org_stats = r.json() + + yymm = datetime.utcnow().strftime("%Y-%m") + assert org_stats["qaCrawlExecSeconds"][yymm] > 0 + assert org_stats["qaUsage"][yymm] > 0 + + +def test_qa_page_data(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + 
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages", + headers=crawler_auth_headers, + ) + data = r.json() + assert len(data["items"]) == 1 + page = data["items"][0] + assert page["title"] == "Webrecorder" + assert page["url"] == "https://webrecorder.net/" + assert page["qa"]["textMatch"] == 1.0 + assert page["qa"]["screenshotMatch"] == 1.0 + assert page["qa"]["resourceCounts"] == { + "crawlGood": 15, + "crawlBad": 0, + "replayGood": 15, + "replayBad": 1, + } + + +def test_qa_replay(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/replay.json", + headers=crawler_auth_headers, + ) + data = r.json() + assert len(data["resources"]) == 1 + assert data["resources"][0]["path"] + + +def test_run_qa_not_running(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/stop", + headers=crawler_auth_headers, + ) + + assert r.status_code == 400 + assert r.json()["detail"] == "qa_not_running" + + +def test_delete_qa_run(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/delete", + json={"qa_run_ids": [qa_run_id]}, + headers=crawler_auth_headers, + ) + + assert r.status_code == 200 + assert r.json()["deleted"] == True + + # deleted from finished qa list + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa", + headers=crawler_auth_headers, + ) + + assert len(r.json()) == 0 + + # deleted from pages + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages", + headers=crawler_auth_headers, + ) + assert len(r.json()["items"]) == 0 diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index ab42a4c1a9..59720c40dc 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -16,6 +16,8 @@ wacz_content = None +page_id = None + def test_list_orgs(admin_auth_headers, default_org_id): r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers) @@ -280,6 +282,43 @@ def test_update_crawl( assert data["description"] == UPDATED_DESC assert data["name"] == UPDATED_NAME assert data["collectionIds"] == UPDATED_COLLECTION_IDS + assert data.get("reviewStatus") is None + + # Update reviewStatus and verify + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}", + headers=admin_auth_headers, + json={ + "reviewStatus": "good", + }, + ) + assert r.status_code == 200 + data = r.json() + assert data["updated"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["reviewStatus"] == "good" + + # Try to update to invalid reviewStatus + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}", + headers=admin_auth_headers, + json={ + "reviewStatus": "invalid", + }, + ) + assert r.status_code == 422 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["reviewStatus"] == "good" # Verify deleting works as well r = requests.patch( @@ -374,6 +413,213 @@ def test_crawl_stats(crawler_auth_headers, default_org_id): assert row["avg_page_time"] or row["avg_page_time"] == 0 +def 
test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): + # Test GET list endpoint + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] >= 0 + + pages = data["items"] + assert pages + + for page in pages: + assert page["id"] + assert page["oid"] + assert page["crawl_id"] + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + assert page["loadState"] + assert page["status"] + + # Test GET page endpoint + global page_id + page_id = pages[0]["id"] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + page = r.json() + + assert page["id"] == page_id + assert page["oid"] + assert page["crawl_id"] + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + assert page["loadState"] + + assert page["notes"] == [] + assert page.get("userid") is None + assert page.get("modified") is None + assert page.get("approved") is None + + # Update page with approval + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}", + headers=crawler_auth_headers, + json={ + "approved": True, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + page = r.json() + + assert page["id"] == page_id + assert page["oid"] + assert page["crawl_id"] + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + assert page["loadState"] + + assert page["notes"] == [] + assert page["userid"] + assert page["modified"] + assert page["approved"] + + +def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): + # Re-add pages and verify they were correctly added + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["started"] + + time.sleep(10) + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] >= 0 + + pages = data["items"] + assert pages + + for page in pages: + assert page["id"] + assert page["oid"] + assert page["crawl_id"] + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + assert page["loadState"] + assert page["status"] + + # Ensure only superuser can re-add pages for all crawls in an org + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/all/pages/reAdd", + headers=crawler_auth_headers, + ) + assert r.status_code == 403 + + +def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id): + note_text = "testing" + updated_note_text = "updated" + untouched_text = "untouched" + + # Add note + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes", + headers=crawler_auth_headers, + json={"text": note_text}, + ) + assert r.status_code == 200 + assert r.json()["added"] + + # Check that note was added + r = requests.get( + 
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert len(data["notes"]) == 1 + + first_note = data["notes"][0] + + first_note_id = first_note["id"] + assert first_note_id + + assert first_note["created"] + assert first_note["userid"] + assert first_note["userName"] + assert first_note["text"] == note_text + + # Add second note to test selective updates/deletes + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes", + headers=crawler_auth_headers, + json={"text": untouched_text}, + ) + assert r.status_code == 200 + assert r.json()["added"] + + # Edit first note + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes", + headers=crawler_auth_headers, + json={"text": updated_note_text, "id": first_note_id}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Verify notes look as expected + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + notes = data["notes"] + + assert len(notes) == 2 + + updated_note = [note for note in notes if note["id"] == first_note_id][0] + assert updated_note["text"] == updated_note_text + + second_note_id = [note["id"] for note in notes if note["text"] == untouched_text][0] + assert second_note_id + + # Delete both notes + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes/delete", + headers=crawler_auth_headers, + json={"delete_list": [first_note_id, second_note_id]}, + ) + assert r.status_code == 200 + assert r.json()["deleted"] + + # Verify notes were deleted + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + notes = data.get("notes") + assert notes == [] + + def test_delete_crawls_crawler( crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id ): @@ -387,6 +633,14 @@ def test_delete_crawls_crawler( data = r.json() assert data["detail"] == "not_allowed" + # Check that pages exist for crawl + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["total"] > 0 + # Test that crawler user can delete own crawl r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete", @@ -398,6 +652,8 @@ def test_delete_crawls_crawler( assert data["deleted"] == 1 assert data["storageQuotaReached"] is False + time.sleep(5) + # Test that crawl is not found after deleting r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}", @@ -405,6 +661,14 @@ def test_delete_crawls_crawler( ) assert r.status_code == 404 + # Test that associated pages are also deleted + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["total"] == 0 + def test_delete_crawls_org_owner( admin_auth_headers, diff --git a/backend/test/test_settings.py b/backend/test/test_settings.py index 1f4515ac43..a15f012ee5 100644 --- a/backend/test/test_settings.py +++ b/backend/test/test_settings.py @@ -14,5 +14,6 @@ def test_settings(): "jwtTokenLifetime": 86400, 
"defaultBehaviorTimeSeconds": 300, "maxPagesPerCrawl": 4, + "maxScale": 3, "defaultPageLoadTimeSeconds": 120, } diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 15c477d073..e4249e9b04 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -1,5 +1,6 @@ import requests import os +import time from urllib.parse import urljoin from .conftest import API_PREFIX @@ -934,6 +935,8 @@ def test_delete_form_upload_and_crawls_from_all_crawls( assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size assert data["storageUsedUploads"] == org_upload_bytes - upload_size + time.sleep(10) + r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{all_crawls_delete_config_id}", headers=admin_auth_headers, diff --git a/backend/test/test_utils.py b/backend/test/test_utils.py index a43b3cb7f3..4c523f78f9 100644 --- a/backend/test/test_utils.py +++ b/backend/test/test_utils.py @@ -1,4 +1,5 @@ """utils tests""" + import pytest from btrixcloud.utils import slug_from_name diff --git a/chart/Chart.yaml b/chart/Chart.yaml index 4a994666e7..920172b154 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -5,7 +5,7 @@ type: application icon: https://webrecorder.net/assets/icon.png # Browsertrix Cloud and Chart Version -version: v1.9.0-beta.2 +version: v1.10.0-beta.0 dependencies: - name: btrix-admin-logging diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 84fad5ef92..3255e56f99 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -4,7 +4,7 @@ metadata: name: crawljob-{{ id }} labels: crawl: "{{ id }}" - role: "job" + role: {{ "qa-job" if qa_source else "job" }} btrix.org: "{{ oid }}" btrix.user: "{{ userid }}" btrix.storage: "{{ storage_name }}" @@ -19,11 +19,15 @@ spec: cid: "{{ cid }}" oid: "{{ oid }}" scale: {{ scale }} - maxCrawlSize: {{ max_crawl_size }} - timeout: {{ timeout }} + + maxCrawlSize: {{ max_crawl_size if not qa_source else 0 }} + timeout: {{ timeout if not qa_source else 0 }} + qaSourceCrawlId: "{{ qa_source }}" + manual: {{ manual }} crawlerChannel: "{{ crawler_channel }}" - ttlSecondsAfterFinished: 30 + ttlSecondsAfterFinished: {{ 30 if not qa_source else 0 }} + warcPrefix: "{{ warc_prefix }}" storageName: "{{ storage_name }}" diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index 67d8c58853..e9ea1834d0 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -53,8 +53,11 @@ spec: volumes: - name: crawl-config configMap: + {% if not qa_source_crawl_id %} name: crawl-config-{{ cid }} - + {% else %} + name: qa-replay-{{ qa_source_crawl_id }} + {% endif %} - name: crawl-data persistentVolumeClaim: claimName: {{ name }} @@ -102,6 +105,7 @@ spec: image: {{ crawler_image }} imagePullPolicy: {{ crawler_image_pull_policy }} command: + {% if not qa_source_crawl_id %} - crawl - --config - /tmp/crawl-config.json @@ -112,6 +116,14 @@ spec: - "@{{ profile_filename }}" {%- endif %} + {% else %} + - qa + - --qaSource + - /tmp/crawl-config.json + - --redisStoreUrl + - {{ redis_url }} + - --writePagesToRedis + {% endif %} volumeMounts: - name: crawl-config mountPath: /tmp/crawl-config.json @@ -149,6 +161,9 @@ spec: - name: STORE_USER value: "{{ userid }}" + - name: WARC_PREFIX + value: "{{ warc_prefix }}" + {% if crawler_socks_proxy_host %} - name: SOCKS_HOST value: "{{ crawler_socks_proxy_host }}" diff --git a/chart/app-templates/profilebrowser.yaml b/chart/app-templates/profilebrowser.yaml 
index 335f705c7d..7c1dab8884 100644 --- a/chart/app-templates/profilebrowser.yaml +++ b/chart/app-templates/profilebrowser.yaml @@ -78,8 +78,8 @@ spec: resources: limits: - memory: "{{ crawler_memory }}" + memory: "{{ profile_memory }}" requests: - cpu: "{{ crawler_cpu }}" - memory: "{{ crawler_memory }}" + cpu: "{{ profile_cpu }}" + memory: "{{ profile_memory }}" diff --git a/chart/app-templates/qa_configmap.yaml b/chart/app-templates/qa_configmap.yaml new file mode 100644 index 0000000000..9fd9e4051b --- /dev/null +++ b/chart/app-templates/qa_configmap.yaml @@ -0,0 +1,14 @@ +# ------- +# CONFIGMAP +# ------- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ name }} + namespace: {{ namespace }} + labels: + crawl: {{ id }} + role: crawler + +data: + crawl-config.json: {{ qa_source_replay_json | tojson }} diff --git a/chart/app-templates/replica_job.yaml b/chart/app-templates/replica_job.yaml index 30870d3fb9..88a7da17b8 100644 --- a/chart/app-templates/replica_job.yaml +++ b/chart/app-templates/replica_job.yaml @@ -13,7 +13,7 @@ spec: template: spec: restartPolicy: Never - priorityClassName: bg-jobs + priorityClassName: bg-job podFailurePolicy: rules: - action: FailJob diff --git a/chart/email-templates/invite b/chart/email-templates/invite index 61a2469d3b..72a8837d85 100644 --- a/chart/email-templates/invite +++ b/chart/email-templates/invite @@ -1,13 +1,13 @@ -Welcome to Browsertrix Cloud! +Welcome to Browsertrix! ~~~

 Hello!
 
-Welcome to Browsertrix Cloud!
+Welcome to Browsertrix!
 
 {% if sender %}
 
-You have been invited by {{ sender }} to join "{{ org_name }}" on Browsertrix Cloud!
+You have been invited by {{ sender }} to join "{{ org_name }}" on Browsertrix!
 
 {% endif %}
@@ -22,11 +22,11 @@ Welcome to Browsertrix Cloud!
 
 When you first access your account, you’ll be directed to your Dashboard. It contains information you may want to view frequently including: Storage Usage, Crawling Info, Collections, and Monthly Usage History. From there, you can click + Create New to create your first Crawl Workflow!
 
-For more info, check out the Browsertrix Cloud User Guide
+For more info, check out the Browsertrix User Guide
 
-We want you to get the most from your Browsertrix Cloud experience!
+We want you to get the most from your Browsertrix experience!
 
 Let us know if you need any questions or feedback.

@@ -39,10 +39,10 @@ You can connect with our team at {{ support ~~~ Hello! -Welcome to Browsertrix Cloud! +Welcome to Browsertrix! {% if sender %} -You have been invited by {{ sender }} to join their organization, "{{ org_name }}" on Browsertrix Cloud! +You have been invited by {{ sender }} to join their organization, "{{ org_name }}" on Browsertrix! {% else %} @@ -51,13 +51,13 @@ You can join by clicking here: {{ invite_url }} When you first access your account, you’ll be directed to your Dashboard. It contains information you may want to view frequently including: Storage Usage, Crawling Info, Collections, and Monthly Usage History. -For more info, check out Browsertrix Cloud User Guide at: https://docs.browsertrix.cloud/user-guide/ +For more info, check out Browsertrix User Guide at: https://docs.browsertrix.cloud/user-guide/ If you ever need to reset your password, go here: {{ origin }}/log-in/forgot-password -We want you to get the most from your Browsertrix Cloud experience. Let us know if you need any questions or feedback. +We want you to get the most from your Browsertrix experience. Let us know if you need any questions or feedback. You can connect with our team at {{ support_email }}. diff --git a/chart/email-templates/validate b/chart/email-templates/validate index 51bc0fed5b..fd62561fe9 100644 --- a/chart/email-templates/validate +++ b/chart/email-templates/validate @@ -1,6 +1,6 @@ -Welcome to Browsertrix Cloud, Verify your Registration. +Welcome to Browsertrix, Verify your Registration. ~~~ -Please verify your registration for Browsertrix Cloud for {{ receiver_email }} +Please verify your registration for Browsertrix for {{ receiver_email }} You can verify by clicking here: {{ origin }}/verify?token={{ token }} diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml index 1c8ce6653a..dd148d6352 100644 --- a/chart/templates/backend.yaml +++ b/chart/templates/backend.yaml @@ -97,11 +97,10 @@ spec: startupProbe: httpGet: - path: /healthz + path: /healthzStartup port: 8000 - initialDelaySeconds: 5 periodSeconds: 5 - failureThreshold: 30 + failureThreshold: 60 successThreshold: 1 readinessProbe: @@ -119,7 +118,7 @@ spec: port: 8000 initialDelaySeconds: 5 periodSeconds: 30 - failureThreshold: 5 + failureThreshold: 15 successThreshold: 1 - name: op @@ -176,7 +175,7 @@ spec: port: {{ .Values.opPort }} initialDelaySeconds: 5 periodSeconds: 5 - failureThreshold: 30 + failureThreshold: 5 successThreshold: 1 readinessProbe: @@ -194,7 +193,7 @@ spec: port: {{ .Values.opPort }} initialDelaySeconds: 5 periodSeconds: 30 - failureThreshold: 5 + failureThreshold: 15 successThreshold: 1 diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 9eb8d9e422..c19255d9c1 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -12,6 +12,8 @@ data: DEFAULT_NAMESPACE: {{ .Release.Namespace }} + FRONTEND_ORIGIN: {{ .Values.frontend_alias | default "http://browsertrix-cloud-frontend" }} + CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}.svc.cluster.local" DEFAULT_ORG: "{{ .Values.default_org }}" @@ -59,7 +61,7 @@ metadata: data: CRAWL_ARGS: >- - --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ 
.Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --restartsOnError --headless {{ .Values.crawler_extra_args }} + --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }} --- apiVersion: v1 diff --git a/chart/templates/operators.yaml b/chart/templates/operators.yaml index 160e301bde..b7126edb4c 100644 --- a/chart/templates/operators.yaml +++ b/chart/templates/operators.yaml @@ -20,6 +20,11 @@ spec: updateStrategy: method: InPlace + - apiVersion: v1 + resource: configmaps + updateStrategy: + method: OnDelete + hooks: sync: webhook: diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 4b63c15970..9acb5ae8ac 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -11,13 +11,25 @@ description: "Priority for crawl instance #{{ . }}" {{- end }} +{{- range untilStep 0 (int .Values.max_crawl_scale) 1 }} +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: qa-crawl-instance-{{ . }} +value: -{{ add 100 . }} +globalDefault: false +description: "Priority for QA crawl instance #{{ . }}" + +{{- end }} + # Lower Priority for Background Jobs --- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: - name: bg-jobs -value: -100 + name: bg-job +value: -1000 globalDefault: false description: "Priority for background jobs" diff --git a/chart/test/test.yaml b/chart/test/test.yaml index 51bd3842b5..b867e49713 100644 --- a/chart/test/test.yaml +++ b/chart/test/test.yaml @@ -22,7 +22,7 @@ crawler_channels: image: "docker.io/webrecorder/browsertrix-crawler:latest" - id: test - image: "docker.io/webrecorder/browsertrix-crawler:latest" + image: "docker.io/webrecorder/browsertrix-crawler:1.1.0-beta.1" mongo_auth: # specify either username + password (for local mongo) diff --git a/chart/values.yaml b/chart/values.yaml index 63685bea5a..f6c4235724 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -17,8 +17,8 @@ disk_utilization_threshold: 90 # crawler logging flags crawler_logging_opts: "stats,behaviors,debug" -# to enable, set to a value other than 'false' -crawler_extract_full_text: false +# to enable, set to one or more comma separate values: to-warc,to-pages,final-to-warc +crawler_extract_full_text: to-warc # max pages per crawl # set to non-zero value to enforce global max pages per crawl limit @@ -86,7 +86,7 @@ default_org: "My Organization" # API Image # ========================================= -backend_image: "docker.io/webrecorder/browsertrix-backend:1.9.0-beta.2" +backend_image: "docker.io/webrecorder/browsertrix-backend:1.10.0-beta.0" backend_pull_policy: "Always" backend_password_secret: "PASSWORD!" 
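For reference, the `qa-crawl-instance-{{ . }}` PriorityClass template added above iterates over `untilStep 0 (int .Values.max_crawl_scale) 1`, emitting one class per possible crawler instance, each with a negative priority that stays above the new `bg-job` value of -1000. A minimal sketch of the rendered output, assuming `max_crawl_scale` is set to 3 (that value is an assumption for illustration):

```yaml
# Rendered classes for indices 0 and 1 (index 2 follows the same pattern);
# value is -{{ add 100 . }}, i.e. -100, -101, -102.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: qa-crawl-instance-0
value: -100
globalDefault: false
description: "Priority for QA crawl instance #0"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: qa-crawl-instance-1
value: -101
globalDefault: false
description: "Priority for QA crawl instance #1"
```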
@@ -117,7 +117,7 @@ profile_browser_idle_seconds: 60 # Nginx Image # ========================================= -frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.9.0-beta.2" +frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.10.0-beta.0" frontend_pull_policy: "Always" frontend_cpu: "10m" @@ -130,6 +130,8 @@ frontend_memory: "64Mi" # if using ingress, this value is ignored local_service_port: 30870 +frontend_alias: "http://browsertrix-cloud-frontend" + # MongoDB Image # ========================================= @@ -220,6 +222,11 @@ crawler_extra_memory_per_browser: 768Mi # crawler_memory = crawler_memory_base + crawler_memory_per_extra_browser * (crawler_browser_instances - 1) # crawler_memory: +# optional: defaults to crawler_memory_base and crawler_cpu_base if not set +# profile_browser_memory: +# +# profile_browser_cpu: + # Other Crawler Settings # ---------------------- diff --git a/docs/assets/brand/browsertrix-icon-white.svg b/docs/assets/brand/browsertrix-icon-white.svg new file mode 100644 index 0000000000..f393ca219b --- /dev/null +++ b/docs/assets/brand/browsertrix-icon-white.svg @@ -0,0 +1,10 @@ + + + + + + + + + diff --git a/docs/assets/brand/btrix-logo.svg b/docs/assets/brand/btrix-logo.svg deleted file mode 100644 index 3e6d39e89c..0000000000 --- a/docs/assets/brand/btrix-logo.svg +++ /dev/null @@ -1,29 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/assets/brand/favicon.svg b/docs/assets/brand/favicon.svg new file mode 100644 index 0000000000..ddcbae18aa --- /dev/null +++ b/docs/assets/brand/favicon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/deploy/ansible/digitalocean.md b/docs/deploy/ansible/digitalocean.md index 8fab27901c..03c1b31823 100644 --- a/docs/deploy/ansible/digitalocean.md +++ b/docs/deploy/ansible/digitalocean.md @@ -2,7 +2,7 @@ *Playbook Path: [ansible/playbooks/install_microk8s.yml](https://github.com/webrecorder/browsertrix-cloud/blob/main/ansible/playbooks/do_setup.yml)* -This playbook provides an easy way to install Browsertrix Cloud on DigitalOcean. It automatically sets up Browsertrix with LetsEncrypt certificates. +This playbook provides an easy way to install Browsertrix on DigitalOcean. It automatically sets up Browsertrix with LetsEncrypt certificates. ### Requirements @@ -13,7 +13,7 @@ To run this ansible playbook, you need to: - `doctl` command line client configured (run `doctl auth init`) - Create a [DigitalOcean Spaces](https://docs.digitalocean.com/reference/api/spaces-api/) API Key which will also need to be set in your terminal sessions environment variables, which should be set as `DO_AWS_ACCESS_KEY` and `DO_AWS_SECRET_KEY` - Configure a DNS A Record and CNAME record. -- Have a working python and pip configuration through your OS Package Manager +- Have a working Python and pip configuration through your OS Package Manager #### Install diff --git a/docs/deploy/ansible/k3s.md b/docs/deploy/ansible/k3s.md index 9109d3115d..d73ef4f2f2 100644 --- a/docs/deploy/ansible/k3s.md +++ b/docs/deploy/ansible/k3s.md @@ -2,7 +2,7 @@ *Playbook Path: [ansible/playbooks/install_k3s.yml](https://github.com/webrecorder/browsertrix-cloud/blob/main/ansible/playbooks/install_k3s.yml)* -This playbook provides an easy way to install Browsertrix Cloud on a Linux box (tested on Rocky Linux 9). It automatically sets up Browsertrix with Let's Encrypt certificates. +This playbook provides an easy way to install Browsertrix on a Linux box (tested on Rocky Linux 9). 
It automatically sets up Browsertrix with Let's Encrypt certificates. ### Requirements @@ -29,7 +29,7 @@ cp -r ansible/inventory/sample-k3s ansible/inventory/my-deployment 2. Change the [hosts IP address](https://github.com/webrecorder/browsertrix-cloud/blob/main/ansible/inventory/sample-k3s/hosts.ini) in your just created inventory -4. You may need to make modifications to the playbook itself based on your configuration. The playbook lists sections that can be removed or changed based on whether you'd like to install a multi-node or single-node k3s installation for your Browsertrix Cloud deployment. By default the playbook assumes you'll run in a single-node environment deploying directly to `localhost` +4. You may need to make modifications to the playbook itself based on your configuration. The playbook lists sections that can be removed or changed based on whether you'd like to install a multi-node or single-node k3s installation for your Browsertrix deployment. By default the playbook assumes you'll run in a single-node environment deploying directly to `localhost` 5. Run the playbook: ```zsh diff --git a/docs/deploy/ansible/microk8s.md b/docs/deploy/ansible/microk8s.md index 547a7d4267..be6687c4bf 100644 --- a/docs/deploy/ansible/microk8s.md +++ b/docs/deploy/ansible/microk8s.md @@ -2,7 +2,7 @@ *Playbook Path: [ansible/playbooks/install_microk8s.yml](https://github.com/webrecorder/browsertrix-cloud/blob/main/ansible/playbooks/install_microk8s.yml)* -This playbook provides an easy way to install Browsertrix Cloud on Ubuntu (tested on Jammy Jellyfish) and RedHat 9 (tested on Rocky Linux 9). It automatically sets up Browsertrix with Letsencrypt certificates. +This playbook provides an easy way to install Browsertrix on Ubuntu (tested on Jammy Jellyfish) and RedHat 9 (tested on Rocky Linux 9). It automatically sets up Browsertrix with Letsencrypt certificates. ### Requirements diff --git a/docs/deploy/customization.md b/docs/deploy/customization.md new file mode 100644 index 0000000000..f145ff7fd3 --- /dev/null +++ b/docs/deploy/customization.md @@ -0,0 +1,87 @@ +# Customizing Browsertrix Deployment + +Local and production deployments alike can be customized by modifying the `chart/values.yaml` Helm chart file or a local override. For more on using local overrides, see the [Local Deployment Guide](local.md). The remainder of this guide covers some of the customization options available in the Helm chart. + +## Default Organization + +The `default_org` setting is used to specify the name for the default organization created in a Browsertrix deployment. A slug will be auto-generated based on this value and can be modified in [Org Settings](../user-guide/org-settings.md) within the application. + +## Superuser + +The `superuser` setting is used to set the username and password for a deployment's superuser. If `password` is left blank, the application will auto-generate a secure password for the superuser. + +## Crawler Channels + +The `crawler_channels` setting is used to specify the [_Crawler Release Channel_](../user-guide/workflow-setup.md#crawler-release-channel) option available to users via dropdown menus in workflows and browser profiles. Each crawler channel has an id and a Docker image tag. These channels are modifiable with the restriction that there must always be one channel with the id `default`. 
By default this is the only channel available on deployments: + +```yaml +crawler_channels: + - id: default + image: "docker.io/webrecorder/browsertrix-crawler:latest" +``` + +This can be extended with additional channels. For example, here is what the value would look like adding a new x.y.z release of Browsertrix Crawler with the id `testing`: + +```yaml +crawler_channels: + - id: default + image: "docker.io/webrecorder/browsertrix-crawler:latest" + - id: testing + image: "docker.io/webrecorder/browsertrix-crawler:x.y.z" +``` + +## Storage + +The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, as can be seen in the default configuration: + +```yaml +storages: + - name: "default" + type: "s3" + access_key: "ADMIN" + secret_key: "PASSW0RD" + bucket_name: *local_bucket_name + + endpoint_url: "http://local-minio.default:9000/" +``` + +It is possible to add one or more replica storage locations. If replica locations are enabled, all stored content in the application will be automatically replicated to each configured replica storage location in background jobs after being stored in the default primary storage. If replica locations are enabled, at least one must be set as the default replica location for primary backups. This is indicated with `is_default_replica: True`. If more than one storage location is configured, the primary storage must also be indicated with `is_default_primary: True`. + +For example, here is what a storage configuration with two replica locations, one in another bucket on the same Minio S3 service as primary storage as well as another in an external S3 provider: + +```yaml +storages: + - name: "default" + type: "s3" + access_key: "ADMIN" + secret_key: "PASSW0RD" + bucket_name: *local_bucket_name + + endpoint_url: "http://local-minio.default:9000/" + is_default_primary: True + + - name: "replica-0" + type: "s3" + access_key: "ADMIN" + secret_key: "PASSW0RD" + bucket_name: "replica-0" + + endpoint_url: "http://local-minio.default:9000/" + is_default_replica: True + + - name: "replica-1" + type: "s3" + access_key: "accesskey" + secret_key: "secret" + bucket_name: "replica-1" + + endpoint_url: "http://s3provider.example.com" +``` + +## Email / SMTP Server + +Browsertrix sends user invitations, password resets, background job failure notifications, and other important messages via email. The `email` setting can be used to configure the SMTP server used to send emails. To avoid email messages from Browsertrix being flagged as spam, be sure to use the same domain for `sender_email` and `reply_to_email`. + +## Signing WACZ files + +Browsertrix has the ability to cryptographically sign WACZ files with [Authsign](https://github.com/webrecorder/authsign). The ``signer`` setting can be used to enable this feature and configure Authsign. diff --git a/docs/deploy/index.md b/docs/deploy/index.md index 74b352a8b4..3b0b28f549 100644 --- a/docs/deploy/index.md +++ b/docs/deploy/index.md @@ -1,14 +1,15 @@ -# Deploying Browsertrix Cloud +# Deploying Browsertrix -Browsertrix Cloud is designed to be a cloud-native application running in Kubernetes. +Browsertrix is designed to be a cloud-native application running in Kubernetes. -However, despite the name, it is perfectly reasonable (and easy!) to deploy Browsertrix Cloud locally using one of the many available local Kubernetes options. 
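The Email / SMTP section of the customization guide above names the `email` setting and its `sender_email` and `reply_to_email` keys, but does not include a sample block. Here is a rough sketch of what a values override for it could look like; the `smtp_host`, `smtp_port`, `password`, and `use_tls` key names are assumptions for illustration, so check `chart/values.yaml` in your deployment for the exact fields:

```yaml
email:
  # Keep the sender and reply-to addresses on the same domain to reduce the
  # chance of Browsertrix mail being flagged as spam (per the guide above).
  sender_email: "browsertrix@example.com"
  reply_to_email: "support@example.com"
  # Assumed key names for the SMTP server itself:
  smtp_host: "smtp.example.com"
  smtp_port: 587
  password: "smtp-password-here"
  use_tls: true
```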
+However, despite the name, it is perfectly reasonable (and easy!) to deploy Browsertrix locally using one of the many available local Kubernetes options. -The main requirements for Browsertrix Cloud are: +The main requirements for Browsertrix are: - A Kubernetes Cluster - [Helm 3](https://helm.sh/) (package manager for Kubernetes) -We have prepared a [Local Deployment Guide](./local) which covers several options for testing Browsertrix Cloud locally on a single machine, as well as a [Production (Self-Hosted and Cloud) Deployment](./production) guides to help with setting up Browsertrix Cloud for different production scenarios. -Details on managing org export and import for existing clusters can be found in the [Org Import & Export](admin/org-import-export.md) guide. +We have prepared a [Local Deployment Guide](local.md) which covers several options for testing Browsertrix locally on a single machine, as well as a [Production (Self-Hosted and Cloud) Deployment](remote.md) guide to help with setting up Browsertrix in different production scenarios. Information about configuring storage, crawler channels, and other details in local or production deployments is in the [Customizing Browsertrix Deployment Guide](customization.md). + +Details on managing org export and import for existing clusters can be found in the [Org Import & Export](admin/org-import-export.md) guide. \ No newline at end of file diff --git a/docs/deploy/local.md b/docs/deploy/local.md index 8d91096c07..8ab5866173 100644 --- a/docs/deploy/local.md +++ b/docs/deploy/local.md @@ -1,10 +1,10 @@ # Local Deployment -To try out the latest release of Browsertrix Cloud on your local machine, you'll first need to have a working Kubernetes cluster. +To try out the latest release of Browsertrix on your local machine, you'll first need to have a working Kubernetes cluster. ## Installing Kubernetes -Before running Browsertrix Cloud, you'll need to set up a running [Kubernetes](https://kubernetes.io/) cluster. +Before running Browsertrix, you'll need to set up a running [Kubernetes](https://kubernetes.io/) cluster. Today, there are numerous ways to deploy Kubernetes fairly easily, and we recommend trying one of the single-node options, which include Docker Desktop, microk8s, minikube, and k3s. @@ -16,7 +16,7 @@ Here are some environment specific instructions for setting up a local cluster f ??? info "Docker Desktop (recommended for macOS and Windows)" - For macOS and Windows, we recommend testing out Browsertrix Cloud using Kubernetes support in Docker Desktop as that will be one of the simplest options. + For macOS and Windows, we recommend testing out Browsertrix using Kubernetes support in Docker Desktop as that will be one of the simplest options. 1. [Install Docker Desktop](https://www.docker.com/products/docker-desktop/) if not already installed. @@ -54,9 +54,9 @@ Here are some environment specific instructions for setting up a local cluster f 3. Set `KUBECONFIG` to point to the config for K3S: `export KUBECONFIG=/etc/rancher/k3s/k3s.yaml` to ensure Helm will use the correct version. -## Launching Browsertrix Cloud with Helm +## Launching Browsertrix with Helm -Once you have a running Kubernetes cluster with one of the options above, and Helm 3 installed, install the latest release of Browsertrix Cloud directly from the latest GitHub release. +Once you have a running Kubernetes cluster with one of the options above, and Helm 3 installed, install the latest release of Browsertrix directly from the latest GitHub release. 
@@ -80,7 +80,7 @@ https://github.com/webrecorder/browsertrix-cloud/releases/download/VERSION/brows **Note:** Subsequent commands will also use `microk8s helm3` instead of `helm`. -The default setup includes the full Browsertrix Cloud system, with frontend, backend api, db (via MongoDB), and storage (via Minio) +The default setup includes the full Browsertrix system, with frontend, backend api, db (via MongoDB), and storage (via Minio) An admin user with name `admin@example.com` and password `PASSW0RD!` will be automatically created. @@ -100,7 +100,7 @@ helm upgrade --install btrix https://github.com/webrecorder/browsertrix-cloud/re -f ./chart/examples/local-config.yaml ``` -The above examples assumes running from a cloned Browsertrix Cloud repo, however the config file can be saved anywhere and specified with `-f `. +The above examples assumes running from a cloned Browsertrix repo, however the config file can be saved anywhere and specified with `-f `. ## Waiting for Cluster to Start @@ -117,7 +117,7 @@ REVISION: 1 TEST SUITE: None ``` -After that, especially on first run, it may take a few minutes for the Browsertrix Cloud cluster to start, as all images need to be downloaded locally. +After that, especially on first run, it may take a few minutes for the Browsertrix cluster to start, as all images need to be downloaded locally. You can try running the following command to wait for all pods to be initialized: @@ -127,7 +127,7 @@ kubectl wait --for=condition=ready pod --all --timeout=300s The command will exit when all pods have been loaded, or if there is an error and it times out. -If the command succeeds, you should be able to access Browsertrix Cloud by loading: [http://localhost:30870/](http://localhost:30870/) in your browser. +If the command succeeds, you should be able to access Browsertrix by loading: [http://localhost:30870/](http://localhost:30870/) in your browser. ??? info "Minikube (on macOS)" @@ -152,7 +152,7 @@ The outputs of these commands are helpful when reporting an issue [on GitHub](ht ## Updating the Cluster -To update the cluster, for example to update to new version `NEWVERSION`, re-run the same command again, which will pull the latest images. In this way, you can upgrade to the latest release of Browsertrix Cloud. The upgrade will preserve the database and current archives. +To update the cluster, for example to update to new version `NEWVERSION`, re-run the same command again, which will pull the latest images. In this way, you can upgrade to the latest release of Browsertrix. The upgrade will preserve the database and current archives. ```shell helm upgrade --install btrix https://github.com/webrecorder/browsertrix-cloud/releases/download/NEWVERSION/browsertrix-cloud-NEWVERSION.tgz @@ -172,4 +172,4 @@ To fully delete all persistent data (db + archives) created in the cluster, run ## Deploying for Local Development -These instructions are intended for deploying the cluster from the latest releases published on GitHub. See [setting up cluster for local development](../develop/local-dev-setup.md) for additional customizations related to developing Browsertrix Cloud and deploying from local images. +These instructions are intended for deploying the cluster from the latest releases published on GitHub. See [setting up cluster for local development](../develop/local-dev-setup.md) for additional customizations related to developing Browsertrix and deploying from local images. 
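The local deployment steps above pass an override file with `-f ./chart/examples/local-config.yaml`. As a rough sketch, such an override can simply set the values you want to differ from the chart defaults, for example the `default_org` and `superuser` settings described in the customization guide; the key names under `superuser` are assumptions for illustration:

```yaml
# Hypothetical local override, saved anywhere and passed to Helm with -f:
#   helm upgrade --install btrix <chart-or-release-url> -f my-local-config.yaml
default_org: "My Test Organization"

superuser:
  # Assumed field names; the guide describes this setting as holding the
  # superuser's username and password.
  email: "admin@example.com"
  # Leave blank to have a secure password auto-generated, per the guide.
  password: ""
```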
diff --git a/docs/deploy/remote.md b/docs/deploy/remote.md index 651072c295..6178c0525d 100644 --- a/docs/deploy/remote.md +++ b/docs/deploy/remote.md @@ -2,7 +2,7 @@ For remote and hosted deployments (both on a single machine or in the cloud), the only requirement is to have a designed domain and (strongly recommended, but not required) second domain for signing web archives. -We are also experimenting with [Ansible playbooks](../deploy/ansible) for cloud deployment setups. +We are also experimenting with [Ansible playbooks](ansible/digitalocean.md) for cloud deployment setups. The production deployments also allow using an external mongodb server, and/or external S3-compatible storage instead of the bundled minio. @@ -94,15 +94,15 @@ mongo_auth: ## Cloud Deployment -There are also many ways to deploy Browsertrix Cloud on various cloud providers. +There are also many ways to deploy Browsertrix on various cloud providers. -To simplify this process, we are working on Ansible playbooks for setting up Browsertrix Cloud on commonly used infrastructure. +To simplify this process, we are working on Ansible playbooks for setting up Browsertrix on commonly used infrastructure. ### Ansible Deployment -[Ansible](https://ansible.com) makes the initial setup and configuration of your Browsertrix Cloud instance automated and repeatable. +[Ansible](https://ansible.com) makes the initial setup and configuration of your Browsertrix instance automated and repeatable. -To use, you will need to [install Ansible](https://docs.ansible.com/ansible/latest/installation_guide/intro_installation.html#control-node-requirements) on your control computer and then you can use these to deploy to Browsertrix Cloud on remote and cloud environments. +To use, you will need to [install Ansible](https://docs.ansible.com/ansible/latest/installation_guide/intro_installation.html#control-node-requirements) on your control computer and then you can use these to deploy to Browsertrix on remote and cloud environments. Currently, we provide playbooks for the following tested environments: diff --git a/docs/develop/docs.md b/docs/develop/docs.md index 6e39ab1aa8..0c1bb5cb78 100644 --- a/docs/develop/docs.md +++ b/docs/develop/docs.md @@ -129,6 +129,16 @@ For in-line code blocks, syntax highlighting should be added for all code-relate Renders to: `#!python range()` +### Paid features + +`Paid Feature`{ .badge-green } + +Some features of Browsertrix only pertain to those paying for the software on a hosted plan. Denote these with the following: + +```markdown +`Paid Feature`{ .badge-green } +``` + ### Admonitions We use [Admonitions](https://squidfunk.github.io/mkdocs-material/reference/admonitions/) in their collapsed state to offer additional context or tips that aren't relevant to all users reading the section. We use standard un-collapsible ones when we need to call attention to a specific point. diff --git a/docs/develop/frontend-dev.md b/docs/develop/frontend-dev.md index 5bacf20933..61d255ec0e 100644 --- a/docs/develop/frontend-dev.md +++ b/docs/develop/frontend-dev.md @@ -1,16 +1,16 @@ # Developing the Frontend UI -This guide explains how to run the Browsertrix Cloud frontend development server with [Yarn](https://classic.yarnpkg.com). +This guide explains how to run the Browsertrix frontend development server with [Yarn](https://classic.yarnpkg.com). Instead of rebuilding the entire frontend image to view your UI changes, you can use the included local development server to access the frontend from your browser. 
This setup is ideal for rapid UI development that does not rely on any backend changes. ## Requirements -### 1. Browsertrix Cloud API backend already in a Kubernetes cluster +### 1. Browsertrix API backend already in a Kubernetes cluster -The frontend development server requires an existing backend that has been deployed locally or is in production. See [Deploying Browsertrix Cloud](../../deploy/). +The frontend development server requires an existing backend that has been deployed locally or is in production. See [Deploying Browsertrix](../deploy/index.md). -### 2. Node.js ≥16 and Yarn 1 +### 2. Node.js ≥18 and Yarn 1 To check if you already have Node.js installed, run the following command in your command line terminal: @@ -64,7 +64,7 @@ API_BASE_URL=http://dev.example.com !!! note - This setup assumes that your API endpoints are available under `/api`, which is the default configuration for the Browsertrix Cloud backend. + This setup assumes that your API endpoints are available under `/api`, which is the default configuration for the Browsertrix backend. If connecting to a local deployment cluster, set `API_BASE_URL` to: diff --git a/docs/develop/index.md b/docs/develop/index.md index 47a54c26cf..5228f16ae0 100644 --- a/docs/develop/index.md +++ b/docs/develop/index.md @@ -1,19 +1,19 @@ --- hide: - - toc + - toc --- -# Developing Browsertrix Cloud +# Developing Browsertrix -Browsertrix Cloud consists of a Python-based backend and TypeScript-based frontend. +Browsertrix consists of a Python-based backend and TypeScript-based frontend. -To develop Browsertrix Cloud, the system must [first be deployed locally](../deploy/local.md) in a Kubernetes cluster. +To develop Browsertrix, the system must [first be deployed locally](../deploy/local.md) in a Kubernetes cluster. The deployment can then be [further customized for local development](./local-dev-setup.md). ### Backend -The backend is an API-only system, using the FastAPI framework. The latest API reference is available under ./api of a running cluster. +The backend is an API-only system, using the FastAPI framework. Latest API docs can be viewed in the browser by adding `/api/redoc` to the URL of a running cluster (ex: `http://localhost:30870/api/redoc` when running locally on port `30870`.) At this time, the backend must be deployed in the Kubernetes cluster. @@ -25,6 +25,6 @@ The frontend UI is implemented in TypeScript, using the Lit framework and Shoela The static build of the frontend is bundled with nginx, but the frontend can be deployed locally in dev mode against an existing backend. -See [Running Frontend](./frontend-dev) for more details. +See [Developing the Frontend UI](frontend-dev.md) for more details. diff --git a/docs/develop/local-dev-setup.md b/docs/develop/local-dev-setup.md index 06d9b7fccc..dd0def5963 100644 --- a/docs/develop/local-dev-setup.md +++ b/docs/develop/local-dev-setup.md @@ -6,7 +6,7 @@ First, see our [Local Deployment guide](../deploy/local.md#installing-kubernetes ## Local Dev Configuration -The local deployment guide explains how to deploy Browsertrix Cloud with latest published images. +The local deployment guide explains how to deploy Browsertrix with latest published images. However, if you are developing locally, you will need to use your local images instead. diff --git a/docs/index.md b/docs/index.md index 8306f33aee..0341533d48 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,13 +6,12 @@ hide: # Home -Welcome to the Browsertrix Cloud official user guide and developer docs. 
These docs will contain the following sections. +Welcome to the Browsertrix official user guide and developer docs. These docs will contain the following sections. -- [Deployment Guide](./deploy) — How to install and deploy Browsertrix Cloud on your local machine, or in the cloud. -- [Developer Docs](./develop) — Information on developing Browsertrix Cloud itself. -- [User Guide](./user-guide) — Instructions and reference for using Browsertrix Cloud. +- [Deployment Guide](deploy/index.md) — How to install and deploy Browsertrix on your local machine, or in the cloud. +- [Developer Docs](develop/index.md) — Information on developing Browsertrix itself. +- [User Guide](user-guide/index.md) — Instructions and reference for using Browsertrix. -If you are unfamiliar with Browsertrix Cloud, please check out [our website](https://browsertrix.cloud), or the main repository at [https://github.com/webrecorder/browsertrix-cloud](https://github.com/webrecorder/browsertrix-cloud) - -Our docs are still under construction. If you find something missing, chances are we haven't gotten around to writing that part yet. If you find typos or something isn't clear or seems incorrect, please open an [issue](https://github.com/webrecorder/browsertrix-cloud/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc) and we'll try to make sure that your questions get answered here in the future! +If you are unfamiliar with Browsertrix, please check out [our website](https://browsertrix.cloud), or the main repository at [https://github.com/webrecorder/browsertrix-cloud](https://github.com/webrecorder/browsertrix-cloud) +If something is missing, unclear, or seems incorrect, please open an [issue](https://github.com/webrecorder/browsertrix-cloud/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc) and we'll try to make sure that your questions get answered here in the future! diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index f43949e8a0..35a5a51a23 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -29,17 +29,17 @@ :root { --md-code-font: "Recursive", monospace; --md-text-font: "Inter", "Helvetica", "Arial", sans-serif; - --wr-blue-primary: #2E7CAB; - --wr-orange-primary: #B85D20; + --wr-blue-primary: #0891B2; + --wr-orange-primary: #C96509; } [data-md-color-scheme="webrecorder"] { - --md-primary-fg-color: #008873; - --md-primary-fg-color--light: #008873; - --md-primary-fg-color--dark: #003c32; + --md-primary-fg-color: #4D7C0F; + --md-primary-fg-color--light: #0782A1; + --md-primary-fg-color--dark: #066B84; --md-typeset-color: black; - --md-accent-fg-color: #01b297; - --md-typeset-a-color: #005447; + --md-accent-fg-color: #0782A1; + --md-typeset-a-color: #066B84; --md-code-bg-color: #F9FAFB; } @@ -108,6 +108,14 @@ code { font-weight: 600; } +.badge-green { + background-color: hsl(142 76% 36%) !important; + border-color: hsl(142 76% 36%) !important; + color: white !important; + font-family: var(--md-text-font); + font-weight: 600; +} + .badge-orange { background-color: var(--wr-orange-primary) !important; border-color: var(--wr-orange-primary) !important; diff --git a/docs/user-guide/archived-items.md b/docs/user-guide/archived-items.md index 483503b5d4..97d93d83e1 100644 --- a/docs/user-guide/archived-items.md +++ b/docs/user-guide/archived-items.md @@ -12,7 +12,7 @@ The status of an archived item depends on its type. 
Uploads will always have the | Status | Description | | ---- | ---- | -| :bootstrap-check-circle: Complete | The crawl completed according to the workflow's settings. Workflows with [limits](../workflow-setup/#limits) set may stop running before they capture every queued page, but the resulting archived item will still be marked as "Complete". | +| :bootstrap-check-circle: Complete | The crawl completed according to the workflow's settings. Workflows with [limits](workflow-setup.md#limits) set may stop running before they capture every queued page, but the resulting archived item will still be marked as "Complete". | | :bootstrap-dash-circle: Stopped | The crawl workflow was _stopped_ gracefully by a user and data is saved. | | :bootstrap-x-octagon: Canceled | The crawl workflow was _canceled_ by a user, no data is saved. | | :bootstrap-exclamation-triangle: Failed | A serious error occurred while crawling, no data is saved.| diff --git a/docs/user-guide/browser-profiles.md b/docs/user-guide/browser-profiles.md index 8756e971d6..914179fa2f 100644 --- a/docs/user-guide/browser-profiles.md +++ b/docs/user-guide/browser-profiles.md @@ -1,6 +1,6 @@ # Browser Profiles -Browser Profiles are saved instances of a web browsing session that can be reused to crawl websites as they were configured, with any cookies or saved login sessions. Using a pre-configured profile also means that content that can only be viewed by logged in users can be archived, without archiving the actual login credentials. +Browser Profiles are saved instances of a web browsing session that can be reused to crawl websites as they were configured, with any cookies, saved login sessions, or browser settings. Using a pre-configured profile also means that content that can only be viewed by logged in users can be archived, without archiving the actual login credentials. !!! tip "Best practice: Create and use web archiving-specific accounts for crawling with browser profiles" @@ -14,17 +14,38 @@ Browser Profiles are saved instances of a web browsing session that can be reuse - Due to nature of social media specifically, existing accounts may have personally identifiable information, even when accessing otherwise public content. - Of course, there are exceptions — such as when the goal is to archive personalized or private content accessible only from designated accounts. + Of course, there are exceptions — such as when the goal is to archive personalized or private content accessible only from designated accounts. In these instances we recommend changing the account's password after crawling is complete. ## Creating New Browser Profiles -New browser profiles can be created on the Browser Profiles page by pressing the _New Browser Profile_ button and providing a starting URL. Once in the profile creator, log in to any websites that should behave as logged in while crawling and accept any pop-ups that require interaction from the user to proceed with using the website. +New browser profiles can be created on the Browser Profiles page by pressing the _New Browser Profile_ button and providing a starting URL. Press the _Finish Browsing_ button to save the browser profile with a _Name_ and _Description_ of what is logged in or otherwise notable about this browser session. +### Logging into Websites + +To crawl content as a logged in user, log into the website you wish to archive as you would on any other browser. Once the account has been logged in, that's it! 
+ +### Accepting Popups + +Some websites are required to get informed consent from users to track them, others require their users to verify their age before viewing adult content. Websites often choose to use cookies — small pieces of configuration data stored in the browser — to store this information alongside other cookies such as a login session. Interacting with popups that store the user's choices in a cookie will in turn store those cookies within the browser profile. Like everything else those cookie values will be used when crawling with the browser profile. + +### Changing Browser Settings + +Browser profiles don't just affect websites! Any of Brave's settings (available at the URL `brave://settings/`) set in the profile creator will be used while crawling. + +??? example "Example: Blocking page resources with Brave's Shields" + Whereas the crawler's scoping settings can be used to define which pages should be crawled, Brave's [Shields](https://brave.com/shields/) feature can block resources on pages from being loaded. By default, Shields will block [EasyList's cookie list](https://easylist.to/) but it can be set to block a number of other included lists under Brave `Settings > Shields > Filter Lists`. + + _Custom Filters_ can also be useful for blocking sites with resources that aren't blocked by one of the existing lists. We use this at Webrecorder to block our web analytics script while crawling our own website by adding `stats.browsertrix.com` to the filter list. In this example, `browsertrix.com` will still load, but Brave will block any communication to `stats.browsertrix.com` and our analytics won't register a page view as a result. While lots of common analytics tools may already be blocked in an existing blocklist, this one likely isn't because we run it ourselves! + + The [Ublock Origin filter syntax](https://github.com/gorhill/uBlock/wiki/Static-filter-syntax) can be used for more specificity over what in-page resources should be blocked. + + All browser setting related blocking features can be used in addition with the [_Block Ads by Domain_](workflow-setup.md#block-ads-by-domain) crawler setting. + ## Editing Existing Browser Profiles -Sometimes websites will log users out or expire cookies after a period of time. In these cases, when crawling the browser profile can still be loaded but may not behave as it did when it was initially set up. +Sometimes websites will log users out or expire cookies or login sessions after a period of time. In these cases, when crawling the browser profile can still be loaded but may not behave as it did when it was initially set up. To update the profile, go to the profile's details page and press the _Edit Browser Profile_ button to load and interact with the sites that need to be re-configured. When finished, press the _Save Browser Profile_ button to return to the profile's details page. diff --git a/docs/user-guide/collections.md b/docs/user-guide/collections.md index 4178a5b3aa..2b9e163939 100644 --- a/docs/user-guide/collections.md +++ b/docs/user-guide/collections.md @@ -11,7 +11,7 @@ Collections are the primary way of organizing and combining archived items into Crawls and uploads can be added to a collection after creation by selecting _Select Archived Items_ from the collection's actions menu. -A crawl workflow can also be set to [automatically add any completed archived items to a collection](../workflow-setup/#collection-auto-add) in the workflow's settings. 
+A crawl workflow can also be set to [automatically add any completed archived items to a collection](workflow-setup.md#collection-auto-add) in the workflow's settings. ## Sharing Collections diff --git a/docs/user-guide/crawl-workflows.md b/docs/user-guide/crawl-workflows.md index 4a148d3906..2b319f9f23 100644 --- a/docs/user-guide/crawl-workflows.md +++ b/docs/user-guide/crawl-workflows.md @@ -4,11 +4,11 @@ Crawl Workflows consist of a list of configuration options that instruct the cra ## Creating and Editing Crawl Workflows -New Crawl Workflows can be created from the Crawling page. A detailed breakdown of available settings can be found [here](../workflow-setup). +New Crawl Workflows can be created from the Crawling page. A detailed breakdown of available settings can be found [here](workflow-setup.md). ## Status -Crawl Workflows inherit the [status of the last item they created](../archived-items/#status). When a workflow has been instructed to run it can have have five possible states: +Crawl Workflows inherit the [status of the last item they created](archived-items.md#status). When a workflow has been instructed to run it can have have five possible states: | Status | Description | | ---- | ---- | @@ -25,11 +25,11 @@ Crawl workflows can be run from the actions menu of the workflow in the crawl wo While crawling, the Watch Crawl page displays a list of queued URLs that will be visited, and streams the current state of the browser windows as they visit pages from the queue. -Running a crawl workflow that has successfully run previously can be useful to capture content as it changes over time, or to run with an updated [Crawl Scope](../workflow-setup/#scope). +Running a crawl workflow that has successfully run previously can be useful to capture content as it changes over time, or to run with an updated [Crawl Scope](workflow-setup.md#scope). ### Live Exclusion Editing -While [exclusions](../workflow-setup/#exclusions) can be set before running a crawl workflow, sometimes while crawling the crawler may find new parts of the site that weren't previously known about and shouldn't be crawled, or get stuck browsing parts of a website that automatically generate URLs known as ["crawler traps"](https://en.wikipedia.org/wiki/Spider_trap). +While [exclusions](workflow-setup.md#exclusions) can be set before running a crawl workflow, sometimes while crawling the crawler may find new parts of the site that weren't previously known about and shouldn't be crawled, or get stuck browsing parts of a website that automatically generate URLs known as ["crawler traps"](https://en.wikipedia.org/wiki/Spider_trap). If the crawl queue is filled with URLs that should not be crawled, use the _Edit Exclusions_ button on the Watch Crawl page to instruct the crawler what pages should be excluded from the queue. @@ -37,7 +37,7 @@ Exclusions added while crawling are applied to the same exclusion table saved in ### Changing the Amount of Crawler Instances -Like exclusions, the [crawler instance](../workflow-setup/#crawler-instances) scale can also be adjusted while crawling. On the Watch Crawl page, press the _Edit Crawler Instances_ button, and set the desired value. +Like exclusions, the [crawler instance](workflow-setup.md#crawler-instances) scale can also be adjusted while crawling. On the Watch Crawl page, press the _Edit Crawler Instances_ button, and set the desired value. Unlike exclusions, this change will not be applied to future workflow runs. 
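The live exclusion editing section above explains when to add exclusions but not what the rules tend to look like. Purely as an illustration (these patterns are hypothetical, not defaults, and in the app they are entered in the workflow's exclusion table rather than a config file), regular-expression style exclusions for common crawler traps might resemble:

```yaml
# Hypothetical exclusion patterns, matched against URLs in the crawl queue.
exclusions:
  - "[?&]replytocom="     # per-comment reply links that multiply URLs
  - "/calendar/\\d{4}/"   # endless date-based calendar pages
  - "[?&]sort="           # re-sorted listings of pages already queued
```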
diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md index 2ece94a8b9..3ca87273bf 100644 --- a/docs/user-guide/index.md +++ b/docs/user-guide/index.md @@ -6,8 +6,8 @@ Welcome to the Browsertrix User Guide. This page covers the basics of using Brow To get started crawling with Browsertrix: -1. Create an account and join an organization [as described here](signup). -2. After being redirected to the organization's [overview page](overview), click the _Create New_ button in the top right and select _[Crawl Workflow](crawl-workflows)_ to begin configuring your first crawl! +1. Create an account and join an organization [as described here](signup.md). +2. After being redirected to the organization's [overview page](overview.md), click the _Create New_ button in the top right and select _[Crawl Workflow](crawl-workflows.md)_ to begin configuring your first crawl! 3. For a simple crawl, choose the _Seeded Crawl_ option, and enter a page url in the _Crawl Start URL_ field. By default, the crawler will archive all pages under the starting path. 4. Next, click _Review & Save_, and ensure the _Run on Save_ option is selected. Then click _Save Workflow_. 5. Wait a moment for the crawler to start and watch as it archives the website! @@ -16,12 +16,12 @@ To get started crawling with Browsertrix: After running your first crawl, check out the following to learn more about Browsertrix's features: -- A detailed list of [crawl workflow setup](workflow-setup) options. -- Adding [exclusions](workflow-setup/#exclusions) to limit your crawl's scope and evading crawler traps by [editing exclusion rules while crawling](crawl-workflows/#live-exclusion-editing). -- Best practices for crawling with [browser profiles](browser-profiles) to capture content only available when logged in to a website. -- Managing archived items, including [uploading previously archived content](archived-items/#uploading-web-archives). -- Organizing and combining archived items with [collections](collections) for sharing and export. -- If you're an admin: [Inviting collaborators to your org](org-settings/#members). +- A detailed list of [crawl workflow setup](workflow-setup.md) options. +- Adding [exclusions](workflow-setup.md#exclusions) to limit your crawl's scope and evading crawler traps by [editing exclusion rules while crawling](crawl-workflows.md#live-exclusion-editing). +- Best practices for crawling with [browser profiles](browser-profiles.md) to capture content only available when logged in to a website. +- Managing archived items, including [uploading previously archived content](archived-items.md#uploading-web-archives). +- Organizing and combining archived items with [collections](collections.md) for sharing and export. +- If you're an admin: [Inviting collaborators to your org](org-settings.md#members). ### Have more questions? diff --git a/docs/user-guide/overview.md b/docs/user-guide/overview.md index 895e0834d3..04517459ba 100644 --- a/docs/user-guide/overview.md +++ b/docs/user-guide/overview.md @@ -12,9 +12,16 @@ For all organizations the storage panel displays the total number of archived it ## Crawling +The crawling panel lists the number of currently running and waiting crawls, as well as the total number of pages captured. + +### Execution Time + +`Paid Feature`{.badge-green} + For organizations with a set execution minute limit, the crawling panel displays a graph of how much execution time has been used and how much is currently remaining. 
Monthly execution time limits reset on the first of each month at 12:00 AM GMT. -The crawling panel also lists the number of currently running and waiting crawls, as well as the total number of pages captured. +??? Question "How is execution time calculated?" + Execution time is the total runtime of all [_Crawler Instances_](workflow-setup.md/#crawler-instances) during a crawl. For instance, if _Crawler Instances_ scale is set to 2× and each crawler instance uses 2 minutes of active crawling time, execution time for the crawl will be 4 minutes. Like elapsed time, this is tracked as the crawl runs so changing the _Crawler Instances_ scale while a crawl is running may change the amount of execution time used in a given time period. ## Collections diff --git a/docs/user-guide/signup.md b/docs/user-guide/signup.md index be4dc28636..22c98f3b2b 100644 --- a/docs/user-guide/signup.md +++ b/docs/user-guide/signup.md @@ -2,7 +2,7 @@ ## Invite Link -If you have been sent an [invite](../org-settings/#members), enter a name and password to create a new account. Your account will be added to the organization you were invited to by an organization admin. +If you have been sent an [invite](org-settings.md#members), enter a name and password to create a new account. Your account will be added to the organization you were invited to by an organization admin. ## Open Registration diff --git a/docs/user-guide/workflow-setup.md b/docs/user-guide/workflow-setup.md index ff7670074b..0b51fa1c29 100644 --- a/docs/user-guide/workflow-setup.md +++ b/docs/user-guide/workflow-setup.md @@ -2,7 +2,7 @@ ## Crawl Type -The first step in creating a new [crawl workflow](../crawl-workflows) is to choose what type of crawl you want to run. Crawl types are fixed and cannot be converted or changed later. +The first step in creating a new [crawl workflow](crawl-workflows.md) is to choose what type of crawl you want to run. Crawl types are fixed and cannot be converted or changed later. `URL List`{ .badge-blue } : The crawler visits every URL specified in a list, and optionally every URL linked on those pages. @@ -120,7 +120,7 @@ Adds a hard limit on the number of pages that will be crawled. The crawl will be ### Crawl Time Limit -The crawl will be gracefully stopped after this set period of time. +The crawl will be gracefully stopped after this set period of elapsed time. ### Crawl Size Limit @@ -132,11 +132,11 @@ Increasing the amount of crawler instances will speed up crawls by using additio ### Page Load Timeout -Limits amount of time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded. +Limits amount of elapsed time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded. ### Behavior Timeout -Limits how long behaviors can run on each page. +Limits amount of elapsed time behaviors have to complete. ### Auto Scroll Behavior @@ -144,13 +144,13 @@ When enabled, the browser will automatically scroll to the end of the page. ### Delay Before Next Page -Waits on the page for a set period of time after any behaviors have finished running. This can be helpful to avoid rate limiting however it will slow down your crawl. +Waits on the page for a set period of elapsed time after any behaviors have finished running. This can be helpful to avoid rate limiting however it will slow down your crawl. ## Browser Settings ### Browser Profile -Sets the [_Browser Profile_](../browser-profiles) to be used for this crawl. 
+Sets the [_Browser Profile_](browser-profiles.md) to be used for this crawl. ### Crawler Release Channel @@ -219,4 +219,4 @@ Apply tags to the workflow. Tags applied to the workflow will propagate to every ### Collection Auto-Add -Search for and specify [collections](../collections) that this crawl workflow should automatically add content to as soon as crawling finishes. Canceled and Failed crawls will not be automatically added to collections. +Search for and specify [collections](collections.md) that this crawl workflow should automatically add content to as soon as crawling finishes. Canceled and Failed crawls will not be automatically added to collections. diff --git a/frontend/.eslintrc.js b/frontend/.eslintrc.js index 150803332b..0b6f92d5d6 100644 --- a/frontend/.eslintrc.js +++ b/frontend/.eslintrc.js @@ -7,9 +7,11 @@ module.exports = { es2017: true, }, extends: [ + "eslint:recommended", + "plugin:@typescript-eslint/eslint-recommended", + "plugin:import-x/recommended", "plugin:wc/recommended", "plugin:lit/recommended", - "plugin:@typescript-eslint/recommended", "prettier", ], plugins: ["@typescript-eslint", "lit"], @@ -19,34 +21,145 @@ module.exports = { }, root: true, rules: { + /* start stylistic rules */ + "@typescript-eslint/adjacent-overload-signatures": "error", + "@typescript-eslint/array-type": "error", + "@typescript-eslint/consistent-type-imports": [ + "error", + { + fixStyle: "inline-type-imports", + }, + ], + "@typescript-eslint/consistent-type-exports": "error", + "@typescript-eslint/prefer-readonly": "warn", + "@typescript-eslint/class-literal-property-style": ["warn", "getters"], + "@typescript-eslint/consistent-generic-constructors": "error", + "@typescript-eslint/consistent-type-assertions": "error", + "@typescript-eslint/no-confusing-non-null-assertion": "warn", + "@typescript-eslint/no-inferrable-types": "warn", + "@typescript-eslint/non-nullable-type-assertion-style": "warn", + "@typescript-eslint/prefer-for-of": "warn", + // "@typescript-eslint/prefer-nullish-coalescing": "warn", + "@typescript-eslint/prefer-optional-chain": "warn", + "@typescript-eslint/prefer-string-starts-ends-with": "error", + "@typescript-eslint/no-meaningless-void-operator": "error", + "@typescript-eslint/no-unnecessary-boolean-literal-compare": "warn", + "@typescript-eslint/no-unnecessary-condition": "warn", + "@typescript-eslint/no-unnecessary-qualifier": "warn", + "@typescript-eslint/no-unnecessary-type-arguments": "warn", + "@typescript-eslint/prefer-reduce-type-parameter": "warn", + "@typescript-eslint/promise-function-async": "warn", + /* end stylistic rules */ + + /* start recommended rules */ "no-restricted-globals": [2, "event", "error"], + "@typescript-eslint/no-base-to-string": "warn", + "@typescript-eslint/no-duplicate-enum-values": "error", + "@typescript-eslint/no-duplicate-type-constituents": "warn", + "@typescript-eslint/no-explicit-any": "error", + "@typescript-eslint/no-extra-non-null-assertion": "error", + "@typescript-eslint/no-floating-promises": "warn", + "@typescript-eslint/no-for-in-array": "warn", "no-unused-vars": "off", "@typescript-eslint/no-unused-vars": [ - "warn", + "error", { argsIgnorePattern: "^_", varsIgnorePattern: "^_", destructuredArrayIgnorePattern: "^_", }, ], - "@typescript-eslint/consistent-type-imports": [ + "no-implied-eval": "off", + "@typescript-eslint/no-implied-eval": "error", + "no-loss-of-precision": "off", + "@typescript-eslint/no-loss-of-precision": "warn", + "@typescript-eslint/no-misused-new": "error", + 
"@typescript-eslint/no-misused-promises": [ + "error", + { checksVoidReturn: false }, + ], + "@typescript-eslint/no-non-null-asserted-nullish-coalescing": "error", + "@typescript-eslint/no-non-null-asserted-optional-chain": "warn", + "@typescript-eslint/no-redundant-type-constituents": "warn", + "@typescript-eslint/no-this-alias": "warn", + "@typescript-eslint/no-unnecessary-type-assertion": "warn", + "@typescript-eslint/no-unnecessary-type-constraint": "warn", + /* TODO eventually turn all these on */ + "@typescript-eslint/no-unsafe-argument": "warn", + // "@typescript-eslint/no-unsafe-assignment": "warn", + // "@typescript-eslint/no-unsafe-call": "warn", + "@typescript-eslint/no-unsafe-declaration-merging": "warn", + "@typescript-eslint/no-unsafe-enum-comparison": "warn", + // "@typescript-eslint/no-unsafe-member-access": "warn", + "@typescript-eslint/no-unsafe-return": "warn", + "@typescript-eslint/prefer-as-const": "warn", + "require-await": "off", + // "@typescript-eslint/require-await": "warn", + "@typescript-eslint/restrict-template-expressions": "warn", + "@typescript-eslint/unbound-method": "off", + "@typescript-eslint/method-signature-style": "error", + /* end recommended rules */ + + /* start import rules */ + // "import-x/no-duplicates": ["error", { "prefer-inline": true }], + "import-x/order": [ "error", { - fixStyle: "inline-type-imports", + "newlines-between": "always", + pathGroups: [ + { + pattern: "@/*", + group: "internal", + }, + { + pattern: "~assets/*", + group: "internal", + }, + ], + distinctGroup: false, + alphabetize: { + order: "asc", + caseInsensitive: true, + }, }, ], - "@typescript-eslint/consistent-type-exports": "error", - "@typescript-eslint/no-explicit-any": "warn", + "import-x/no-relative-packages": "error", + "import-x/no-useless-path-segments": [ + "error", + { + noUselessIndex: true, + }, + ], + "import-x/no-cycle": "error", }, reportUnusedDisableDirectives: true, - ignorePatterns: ["__generated__", "__mocks__"], + settings: { + "import-x/resolver": { + typescript: true, + }, + }, + ignorePatterns: ["__generated__", "__mocks__", "dist"], overrides: [ { extends: ["plugin:@typescript-eslint/disable-type-checked"], - files: ["webpack.*.js"], + files: [ + "webpack.*.js", + "config/*.js", + "scripts/*.js", + ".*.js", + "*.config.js", + ], + env: { node: true }, rules: { "@typescript-eslint/no-var-requires": "off", }, }, + { + files: ["*.test.ts"], + rules: { + "@typescript-eslint/no-floating-promises": "off", + "@typescript-eslint/no-unsafe-call": "off", + }, + }, ], }; diff --git a/frontend/.prettierignore b/frontend/.prettierignore index 715e885578..2cc422d4f7 100644 --- a/frontend/.prettierignore +++ b/frontend/.prettierignore @@ -1,4 +1,5 @@ __generated__ __mocks__ xliff -assets \ No newline at end of file +assets +dist diff --git a/frontend/Dockerfile b/frontend/Dockerfile index e3ce800898..fd3f435e9e 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,21 +1,21 @@ # syntax=docker/dockerfile:1.4 -FROM --platform=$BUILDPLATFORM docker.io/library/node:16 as build_deps +FROM --platform=$BUILDPLATFORM docker.io/library/node:18 as build_deps WORKDIR /app COPY yarn.lock package.json ./ # Uses `yarn cache clean` to let Docker cache layer instead # of including yarn cache in the build image RUN yarn --production --frozen-lockfile --ignore-optional --network-timeout 1000000 && \ - yarn cache clean + yarn cache clean COPY --link lit-localize.json \ - postcss.config.js \ - tailwind.config.js \ - tsconfig.json \ - webpack.config.js \ - webpack.prod.js 
\ - index.d.ts \ - ./ + postcss.config.js \ + tailwind.config.js \ + tsconfig.json \ + webpack.config.js \ + webpack.prod.js \ + index.d.ts \ + ./ COPY --link src ./src/ @@ -27,8 +27,8 @@ ARG GIT_BRANCH_NAME ARG VERSION ENV GIT_COMMIT_HASH=${GIT_COMMIT_HASH} \ - GIT_BRANCH_NAME=${GIT_BRANCH_NAME} \ - VERSION=${VERSION} + GIT_BRANCH_NAME=${GIT_BRANCH_NAME} \ + VERSION=${VERSION} # Prevent Docker image including node_modules to save space RUN yarn build && \ diff --git a/frontend/config/dev-server.js b/frontend/config/dev-server.js index 4d5df5af15..05d0997747 100644 --- a/frontend/config/dev-server.js +++ b/frontend/config/dev-server.js @@ -1,4 +1,4 @@ -// eslint-disable-next-line @typescript-eslint/no-var-requires +/* eslint-env node */ const path = require("path"); require(path.resolve(process.cwd(), "./webpack.config.js")); @@ -8,7 +8,7 @@ const RWP_BASE_URL = if (!process.env.API_BASE_URL) { throw new Error( - "To run a dev frontend server, please set the API_BASE_URL pointing to your backend api server in '.env.local'" + "To run a dev frontend server, please set the API_BASE_URL pointing to your backend api server in '.env.local'", ); } diff --git a/frontend/index.d.ts b/frontend/index.d.ts index 90231e4355..57f96d4072 100644 --- a/frontend/index.d.ts +++ b/frontend/index.d.ts @@ -2,3 +2,8 @@ declare module "*.svg"; declare module "*.webp"; declare module "*.css"; declare module "regex-colorize"; + +/** + * Flattens to a normal string type, but preserves string literal suggestions + */ +type AnyString = string & {}; diff --git a/frontend/package.json b/frontend/package.json index 0a4a894939..5cc3f57068 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,19 +1,20 @@ { "name": "browsertrix-frontend", - "version": "1.9.0-beta.2", + "version": "1.10.0-beta.0", "main": "index.ts", "license": "AGPL-3.0-or-later", "dependencies": { "@cheap-glitch/mi-cron": "^1.0.1", + "@ianvs/prettier-plugin-sort-imports": "^4.2.1", "@lit/localize": "^0.12.1", "@novnc/novnc": "^1.4.0-beta", "@rollup/plugin-commonjs": "^18.0.0", - "@shoelace-style/shoelace": "^2.8.0", + "@shoelace-style/shoelace": "~2.10.0", "@types/color": "^3.0.2", "@types/lodash": "^4.14.178", "@types/sinon": "^10.0.6", - "@typescript-eslint/eslint-plugin": "^6.12.0", - "@typescript-eslint/parser": "^6.12.0", + "@typescript-eslint/eslint-plugin": "^6.20.0", + "@typescript-eslint/parser": "^6.20.0", "@wysimark/standalone": "3.0.20", "@xstate/fsm": "^1.6.2", "@zxcvbn-ts/core": "^3.0.4", @@ -28,11 +29,13 @@ "del-cli": "^4.0.1", "dotenv": "^10.0.0", "dotenv-webpack": "^7.0.3", - "eslint": "^8.54.0", - "eslint-config-prettier": "^8.3.0", - "eslint-plugin-lit": "^1.10.1", - "eslint-plugin-wc": "^1.3.2", - "eslint-webpack-plugin": "^3.1.1", + "eslint": "^8.56.0", + "eslint-config-prettier": "^9.1.0", + "eslint-import-resolver-typescript": "^3.6.1", + "eslint-plugin-import-x": "^0.4.1", + "eslint-plugin-lit": "^1.11.0", + "eslint-plugin-wc": "^2.0.4", + "eslint-webpack-plugin": "^4.0.1", "fork-ts-checker-webpack-plugin": "^6.2.6", "fuse.js": "^6.5.3", "glob": "^8.1.0", @@ -52,17 +55,17 @@ "postcss-lit": "^1.1.1", "postcss-loader": "^6.1.1", "postinstall-postinstall": "^2.1.0", - "prettier": "^2.4.1", + "prettier": "^3.2.4", "pretty-ms": "^7.0.1", "query-string": "^8.1.0", "regex-colorize": "^0.0.3", "slugify": "^1.6.6", "style-loader": "^3.3.0", - "tailwindcss": "^3.2.7", + "tailwindcss": "^3.4.1", "terser-webpack-plugin": "^5.3.9", "ts-loader": "^9.2.6", "tsconfig-paths-webpack-plugin": "^4.1.0", - "typescript": "^4.5.2", + 
"typescript": "^5.3.3", "update-dotenv": "^1.1.1", "url-pattern": "^1.0.3", "webpack": "^5.88.0", @@ -80,9 +83,11 @@ "build:analyze": "BUNDLE_ANALYZER=true webpack --config webpack.prod.js", "start": "webpack serve --mode=development --config webpack.dev.js", "serve": "node scripts/serve.js", - "lint": "eslint --fix \"src/**/*.{ts,js}\"", + "lint": "eslint --fix .", + "lint:check": "eslint .", "lint:lit-analyzer": "lit-analyzer", - "format": "prettier --write \"src/**/*.{ts,js,html,css,json}\"", + "format": "prettier --write .", + "format:check": "prettier --check .", "localize:prepare": "yarn localize:extract && yarn localize:build", "localize:extract": "lit-localize extract", "localize:build": "lit-localize build" @@ -90,10 +95,11 @@ "devDependencies": { "@lit/localize-tools": "^0.7.1", "@web/dev-server-esbuild": "^0.3.3", - "@web/dev-server-import-maps": "^0.0.6", - "@web/dev-server-rollup": "^0.3.21", + "@web/dev-server-import-maps": "^0.2.0", + "@web/dev-server-rollup": "^0.6.1", "husky": "^8.0.3", "lint-staged": "^13.1.0", + "prettier-plugin-tailwindcss": "^0.5.11", "rollup-plugin-typescript-paths": "^1.4.0", "sinon": "^12.0.1", "ts-lit-plugin": "^2.0.1", @@ -113,7 +119,7 @@ "prettier --write", "eslint --fix --quiet" ], - "*.{html,css,json}": "prettier --write" + "*.{html,css,json,webmanifest}": "prettier --write" }, "husky": { "hooks": { @@ -121,10 +127,14 @@ } }, "engines": { - "node": ">=16" + "node": ">=18" }, "resolutions": { "**/playwright": "1.32.1", - "**/lit": "3.1.1" - } + "**/lit": "3.1.1", + "@web/dev-server-esbuild/esbuild": "^0.19.5" + }, + "browserslist": [ + "defaults" + ] } diff --git a/frontend/prettier.config.js b/frontend/prettier.config.js new file mode 100644 index 0000000000..3e919d187d --- /dev/null +++ b/frontend/prettier.config.js @@ -0,0 +1,28 @@ +/** @type {import("@ianvs/prettier-plugin-sort-imports").PrettierConfig} */ +module.exports = { + plugins: [ + "@ianvs/prettier-plugin-sort-imports", + "prettier-plugin-tailwindcss", + ], + tailwindFunctions: ["tw"], + importOrder: [ + "", + "", + "", + "", + // Parent directory items + "^\\.\\.$", + "^\\.\\.(/.+)$", + "", + // This directory items + "^\\.(/.+)$", + "", + "^\\.$", + "", + "^@/(.*)$", + "^~assets/(.*)", + "", + ], + importOrderParserPlugins: ["typescript", "decorators-legacy"], + importOrderTypeScriptVersion: "5.0.0", +}; diff --git a/frontend/scripts/serve.js b/frontend/scripts/serve.js index 6b4e12783e..d0c241f4b3 100644 --- a/frontend/scripts/serve.js +++ b/frontend/scripts/serve.js @@ -1,7 +1,8 @@ // Serve app locally without building with webpack, e.g. for e2e +const connectHistoryApiFallback = require("connect-history-api-fallback"); const express = require("express"); const { createProxyMiddleware } = require("http-proxy-middleware"); -const connectHistoryApiFallback = require("connect-history-api-fallback"); + const devServerConfig = require("../config/dev-server.js"); const app = express(); diff --git a/frontend/src/__mocks__/css.js b/frontend/src/__mocks__/_empty.js similarity index 60% rename from frontend/src/__mocks__/css.js rename to frontend/src/__mocks__/_empty.js index 38ac58d911..b7857c246c 100644 --- a/frontend/src/__mocks__/css.js +++ b/frontend/src/__mocks__/_empty.js @@ -1,10 +1,10 @@ /** - * Use to mock css files in tests. + * Use to mock files in tests. 
* * Usage in web-test-runner.config.mjs: * importMap: { * imports: { - * 'styles.css': '/src/__mocks__/css.js' + * 'styles.css': '/src/__mocks__/_empty.js' * }, * }, */ diff --git a/frontend/src/assets/favicons/apple-touch-icon.png b/frontend/src/assets/favicons/apple-touch-icon.png new file mode 100644 index 0000000000..3ff846241a Binary files /dev/null and b/frontend/src/assets/favicons/apple-touch-icon.png differ diff --git a/frontend/src/assets/favicons/favicon.ico b/frontend/src/assets/favicons/favicon.ico new file mode 100644 index 0000000000..a0f9a738d6 Binary files /dev/null and b/frontend/src/assets/favicons/favicon.ico differ diff --git a/frontend/src/assets/favicons/favicon.svg b/frontend/src/assets/favicons/favicon.svg new file mode 100644 index 0000000000..ddcbae18aa --- /dev/null +++ b/frontend/src/assets/favicons/favicon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frontend/src/assets/favicons/icon-192.png b/frontend/src/assets/favicons/icon-192.png new file mode 100644 index 0000000000..8d3269d852 Binary files /dev/null and b/frontend/src/assets/favicons/icon-192.png differ diff --git a/frontend/src/assets/favicons/icon-512.png b/frontend/src/assets/favicons/icon-512.png new file mode 100644 index 0000000000..8c658bf15e Binary files /dev/null and b/frontend/src/assets/favicons/icon-512.png differ diff --git a/frontend/src/assets/icons/copy-code.svg b/frontend/src/assets/icons/copy-code.svg deleted file mode 100644 index 93e69d8ef0..0000000000 --- a/frontend/src/assets/icons/copy-code.svg +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/frontend/src/classes/TailwindElement.ts b/frontend/src/classes/TailwindElement.ts index 3ca265665c..de44b7d722 100644 --- a/frontend/src/classes/TailwindElement.ts +++ b/frontend/src/classes/TailwindElement.ts @@ -1,4 +1,5 @@ import { LitElement } from "lit"; + import { theme } from "@/theme"; export class TailwindElement extends LitElement { diff --git a/frontend/src/components/not-found.ts b/frontend/src/components/not-found.ts index dc288d4e79..6bb2fe74e2 100644 --- a/frontend/src/components/not-found.ts +++ b/frontend/src/components/not-found.ts @@ -1,6 +1,6 @@ -import { LitElement, html } from "lit"; +import { localized, msg } from "@lit/localize"; +import { html, LitElement } from "lit"; import { customElement } from "lit/decorators.js"; -import { msg, localized } from "@lit/localize"; @customElement("btrix-not-found") @localized() @@ -10,7 +10,7 @@ export class NotFound extends LitElement { } render() { return html` -
+
${msg("Page not found")}
`; diff --git a/frontend/src/components/orgs-list.ts b/frontend/src/components/orgs-list.ts index 557190ed31..00a951c653 100644 --- a/frontend/src/components/orgs-list.ts +++ b/frontend/src/components/orgs-list.ts @@ -1,11 +1,11 @@ +import { localized, msg, str } from "@lit/localize"; +import type { SlInput } from "@shoelace-style/shoelace"; +import { type TemplateResult } from "lit"; import { customElement, property } from "lit/decorators.js"; -import { msg, localized, str } from "@lit/localize"; - -import type { CurrentUser, UserOrg } from "../types/user"; -import type { OrgData } from "../utils/orgs"; -import LiteElement, { html } from "../utils/LiteElement"; -import type { SlInput } from "@shoelace-style/shoelace"; +import type { CurrentUser, UserOrg } from "@/types/user"; +import LiteElement, { html } from "@/utils/LiteElement"; +import type { OrgData } from "@/utils/orgs"; @localized() @customElement("btrix-orgs-list") @@ -31,7 +31,7 @@ export class OrgsList extends LiteElement { } return html` -
`, )} ${when( score >= this.min && score < this.optimal, () => html`

${msg( - "Tip: To generate very strong passwords, consider using a password manager." + "Tip: To generate very strong passwords, consider using a password manager.", )}

- ` + `, )}
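The hunk above only touches formatting (trailing commas added by the new Prettier settings), but the pattern it reformats, Lit's when() directive gating a localized tip on the zxcvbn password score, may be easier to follow in isolation. The following is a minimal standalone sketch and not part of the patch; the element name, property defaults, and markup are hypothetical.

import { localized, msg } from "@lit/localize";
import { html, LitElement } from "lit";
import { customElement, property } from "lit/decorators.js";
import { when } from "lit/directives/when.js";

// Hypothetical element name and thresholds, for illustration only
@localized()
@customElement("demo-password-tip")
export class DemoPasswordTip extends LitElement {
  // zxcvbn-style score, 0 (weakest) to 4 (strongest)
  @property({ type: Number }) score = 0;
  @property({ type: Number }) min = 2;
  @property({ type: Number }) optimal = 4;

  render() {
    // Only show the tip for passwords that clear the minimum but are not yet optimal
    return html`${when(
      this.score >= this.min && this.score < this.optimal,
      () =>
        html`<p>
          ${msg(
            "Tip: To generate very strong passwords, consider using a password manager.",
          )}
        </p>`,
    )}`;
  }
}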
diff --git a/frontend/src/components/ui/relative-duration.ts b/frontend/src/components/ui/relative-duration.ts index a6df8bfce4..609b6547ed 100644 --- a/frontend/src/components/ui/relative-duration.ts +++ b/frontend/src/components/ui/relative-duration.ts @@ -1,6 +1,6 @@ -import { LitElement } from "lit"; -import { property, state, customElement } from "lit/decorators.js"; import { localized } from "@lit/localize"; +import { LitElement } from "lit"; +import { customElement, property, state } from "lit/decorators.js"; import humanizeDuration from "pretty-ms"; export type HumanizeOptions = { @@ -84,7 +84,7 @@ export class RelativeDuration extends LitElement { compact: this.compact, verbose: this.verbose, unitCount: this.unitCount, - } + }, ); } } diff --git a/frontend/src/components/ui/search-combobox.ts b/frontend/src/components/ui/search-combobox.ts index eec2d5e612..b4cc616c95 100644 --- a/frontend/src/components/ui/search-combobox.ts +++ b/frontend/src/components/ui/search-combobox.ts @@ -1,15 +1,18 @@ -import { LitElement, html, nothing } from "lit"; -import { property, state, query, customElement } from "lit/decorators.js"; -import { msg, localized } from "@lit/localize"; +import { localized, msg } from "@lit/localize"; +import type { SlInput, SlMenuItem } from "@shoelace-style/shoelace"; +import Fuse from "fuse.js"; +import { html, LitElement, nothing, type PropertyValues } from "lit"; +import { customElement, property, query, state } from "lit/decorators.js"; import { when } from "lit/directives/when.js"; import debounce from "lodash/fp/debounce"; -import Fuse from "fuse.js"; -import type { SlInput, SlMenuItem } from "@shoelace-style/shoelace"; -export type SelectEvent = CustomEvent<{ +import { type UnderlyingFunction } from "@/types/utils"; + +type SelectEventDetail = { key: string | null; value?: T; -}>; +}; +export type SelectEvent = CustomEvent>; const MIN_SEARCH_LENGTH = 2; const MAX_SEARCH_RESULTS = 10; @@ -39,7 +42,7 @@ export class SearchCombobox extends LitElement { placeholder: string = msg("Start typing to search"); @state() - private searchByValue: string = ""; + private searchByValue = ""; private get hasSearchStr() { return this.searchByValue.length >= MIN_SEARCH_LENGTH; @@ -49,7 +52,7 @@ export class SearchCombobox extends LitElement { private searchResultsOpen = false; @query("sl-input") - private input!: SlInput; + private readonly input!: SlInput; private fuse = new Fuse([], { keys: [], @@ -63,15 +66,19 @@ export class SearchCombobox extends LitElement { super.disconnectedCallback(); } - protected willUpdate(changedProperties: Map) { + protected willUpdate(changedProperties: PropertyValues) { if (changedProperties.get("selectedKey") && !this.selectedKey) { this.onSearchInput.cancel(); this.searchByValue = ""; } - if (changedProperties.has("searchKeys") && this.searchKeys) { + if (changedProperties.has("searchKeys")) { this.onSearchInput.cancel(); this.fuse = new Fuse([], { - ...(this.fuse as any).options, + ...( + this.fuse as unknown as { + options: ConstructorParameters[1]; + } + ).options, keys: this.searchKeys, }); } @@ -99,12 +106,12 @@ export class SearchCombobox extends LitElement { this.searchByValue = item.value; await this.updateComplete; this.dispatchEvent( - >new CustomEvent("btrix-select", { + new CustomEvent>("btrix-select", { detail: { - key: key, - value: item.value, + key: key ?? 
null, + value: item.value as T, }, - }) + }), ); }} > @@ -118,19 +125,21 @@ export class SearchCombobox extends LitElement { this.onSearchInput.cancel(); this.dispatchEvent(new CustomEvent("btrix-clear")); }} - @sl-input=${this.onSearchInput as () => void} + @sl-input=${this.onSearchInput as UnderlyingFunction< + typeof this.onSearchInput + >} > ${when( - this.selectedKey && this.keyLabels?.[this.selectedKey as string], + this.selectedKey && this.keyLabels?.[this.selectedKey], () => html`${this.keyLabels![this.selectedKey as string]}${this.keyLabels![this.selectedKey!]}`, - () => html`` + () => html``, )} ${this.renderSearchResults()} @@ -175,15 +184,15 @@ export class SearchCombobox extends LitElement { `; } return nothing; - }) + }), )} `; } - private onSearchInput = debounce(150)(() => { - this.searchByValue = this.input.value?.trim(); + private readonly onSearchInput = debounce(150)(() => { + this.searchByValue = this.input.value.trim(); - if (this.searchResultsOpen === false && this.hasSearchStr) { + if (!this.searchResultsOpen && this.hasSearchStr) { this.searchResultsOpen = true; } diff --git a/frontend/src/components/ui/section-heading.ts b/frontend/src/components/ui/section-heading.ts index 110698dd9d..ffb4808150 100644 --- a/frontend/src/components/ui/section-heading.ts +++ b/frontend/src/components/ui/section-heading.ts @@ -1,4 +1,4 @@ -import { LitElement, html, css } from "lit"; +import { css, html, LitElement } from "lit"; import { customElement } from "lit/decorators.js"; /** diff --git a/frontend/src/components/ui/select-crawler.ts b/frontend/src/components/ui/select-crawler.ts index b4630c64de..484963d779 100644 --- a/frontend/src/components/ui/select-crawler.ts +++ b/frontend/src/components/ui/select-crawler.ts @@ -1,12 +1,24 @@ +import { localized, msg } from "@lit/localize"; +import { type SlSelect } from "@shoelace-style/shoelace"; import { html } from "lit"; -import { property, state, customElement } from "lit/decorators.js"; -import { msg, localized } from "@lit/localize"; - -import type { AuthState } from "../../utils/AuthService"; -import type { CrawlerChannel } from "../../pages/org/types"; +import { customElement, property, state } from "lit/decorators.js"; +import capitalize from "lodash/fp/capitalize"; +import type { CrawlerChannel } from "@/pages/org/types"; +import type { AuthState } from "@/utils/AuthService"; import LiteElement from "@/utils/LiteElement"; -import capitalize from "lodash/fp/capitalize"; + +type SelectCrawlerChangeDetail = { + value: string | undefined; +}; + +export type SelectCrawlerChangeEvent = CustomEvent; + +type SelectCrawlerUpdateDetail = { + show: boolean; +}; + +export type SelectCrawlerUpdateEvent = CustomEvent; type CrawlerChannelsAPIResponse = { channels: CrawlerChannel[]; @@ -45,7 +57,7 @@ export class SelectCrawler extends LiteElement { private crawlerChannels?: CrawlerChannel[]; protected firstUpdated() { - this.fetchCrawlerChannels(); + void this.fetchCrawlerChannels(); } render() { @@ -63,15 +75,16 @@ export class SelectCrawler extends LiteElement { @sl-change=${this.onChange} @sl-focus=${() => { // Refetch to keep list up to date - this.fetchCrawlerChannels(); + void this.fetchCrawlerChannels(); }} @sl-hide=${this.stopProp} @sl-after-hide=${this.stopProp} > ${this.crawlerChannels?.map( - (crawler) => html` - ${capitalize(crawler.id)} - ` + (crawler) => + html` + ${capitalize(crawler.id)} + `, )} ${this.selectedCrawler ? 
html` @@ -87,19 +100,19 @@ export class SelectCrawler extends LiteElement { `; } - private onChange(e: any) { + private onChange(e: Event) { this.stopProp(e); this.selectedCrawler = this.crawlerChannels?.find( - ({ id }) => id === e.target.value + ({ id }) => id === (e.target as SlSelect).value, ); this.dispatchEvent( - new CustomEvent("on-change", { + new CustomEvent("on-change", { detail: { value: this.selectedCrawler?.id, }, - }) + }), ); } @@ -109,11 +122,11 @@ export class SelectCrawler extends LiteElement { private async fetchCrawlerChannels(): Promise { try { const channels = await this.getCrawlerChannels(); - this.crawlerChannels = channels as CrawlerChannel[]; + this.crawlerChannels = channels; if (this.crawlerChannel && !this.selectedCrawler) { this.selectedCrawler = this.crawlerChannels.find( - ({ id }) => id === this.crawlerChannel + ({ id }) => id === this.crawlerChannel, ); } @@ -124,19 +137,19 @@ export class SelectCrawler extends LiteElement { detail: { value: "default", }, - }) + }), ); this.selectedCrawler = this.crawlerChannels.find( - ({ id }) => id === this.crawlerChannel + ({ id }) => id === this.crawlerChannel, ); } this.dispatchEvent( - new CustomEvent("on-update", { + new CustomEvent("on-update", { detail: { show: this.crawlerChannels.length > 1, }, - }) + }), ); } catch (e) { this.notify({ @@ -151,7 +164,7 @@ export class SelectCrawler extends LiteElement { const data: CrawlerChannelsAPIResponse = await this.apiFetch( `/orgs/${this.orgId}/crawlconfigs/crawler-channels`, - this.authState! + this.authState!, ); return data.channels; @@ -162,7 +175,7 @@ export class SelectCrawler extends LiteElement { * Prevents bug where sl-dialog closes when dropdown closes * https://github.com/shoelace-style/shoelace/issues/170 */ - private stopProp(e: CustomEvent) { + private stopProp(e: Event) { e.stopPropagation(); } } diff --git a/frontend/src/components/ui/tab-list.ts b/frontend/src/components/ui/tab-list.ts index 220a8fe531..6ab9e435c7 100644 --- a/frontend/src/components/ui/tab-list.ts +++ b/frontend/src/components/ui/tab-list.ts @@ -1,11 +1,12 @@ -import { TailwindElement } from "@/classes/TailwindElement"; -import { LitElement, html, css } from "lit"; -import { property, queryAsync, customElement } from "lit/decorators.js"; +import { css, html, LitElement, type PropertyValues } from "lit"; +import { customElement, property, queryAsync } from "lit/decorators.js"; import { ifDefined } from "lit/directives/if-defined.js"; +import { TailwindElement } from "@/classes/TailwindElement"; + const DEFAULT_PANEL_ID = "default-panel"; -// Breakpoint in pixels for 2-column layout -const TWO_COL_SCREEN_MIN = 1032; +// postcss-lit-disable-next-line +export const TWO_COL_SCREEN_MIN_CSS = css`64.5rem`; /** * Tab list @@ -59,7 +60,7 @@ export class Tab extends TailwindElement { render() { return html`
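The tab-list hunk above swaps the hard-coded 1032px breakpoint for an exported, css-tagged constant (64.5rem, which equals 1032px at the default 16px root font size). Lit's css template accepts only other css-tagged values or numbers as interpolations, which is presumably why the constant is wrapped in css rather than exported as a plain string. Below is a minimal sketch of how such a constant can be consumed; it is not part of the patch, and the element name and layout rules are hypothetical.

import { css, html, LitElement } from "lit";
import { customElement } from "lit/decorators.js";

// Mirrors the exported constant in tab-list.ts: a css-tagged breakpoint value
export const TWO_COL_SCREEN_MIN_CSS = css`64.5rem`;

@customElement("demo-two-col-layout")
export class DemoTwoColLayout extends LitElement {
  static styles = css`
    .layout {
      display: block;
    }
    /* Interpolating another css-tagged value (a CSSResult) is allowed here */
    @media only screen and (min-width: ${TWO_COL_SCREEN_MIN_CSS}) {
      .layout {
        display: grid;
        grid-template-columns: 1fr 1fr;
        gap: 1rem;
      }
    }
  `;

  render() {
    return html`<div class="layout"><slot></slot></div>`;
  }
}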