From 9e2da4516f9e05212a2e4d892c9010f72fbc8300 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Mon, 1 Jul 2024 16:45:28 -0400 Subject: [PATCH] chore(python): add crawl_state query --- .github/workflows/python.yml | 39 ++++++++ javascript/package-lock.json | 4 +- javascript/package.json | 2 +- python/requirements.txt | 2 + python/setup.py | 2 +- python/spider/spider.py | 35 ++++--- python/tests/test_spider.py | 174 +++++++++++++++-------------------- 7 files changed, 140 insertions(+), 118 deletions(-) create mode 100644 .github/workflows/python.yml create mode 100644 python/requirements.txt diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml new file mode 100644 index 0000000..b6205fd --- /dev/null +++ b/.github/workflows/python.yml @@ -0,0 +1,39 @@ +name: Python CI + +on: + push: + branches: ["main"] + pull_request: + branches: ["main"] + +jobs: + build: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [3.11, 3.12] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install . + pip install -r requirements.txt + working-directory: ./python + + - name: Run tests + run: | + pytest + working-directory: ./python/tests + env: + SPIDER_API_KEY: ${{ secrets.SPIDER_API_KEY }} + SPIDER_EMAIL: ${{ secrets.SPIDER_EMAIL }} + SPIDER_PASSWORD: ${{ secrets.SPIDER_PASSWORD }} \ No newline at end of file diff --git a/javascript/package-lock.json b/javascript/package-lock.json index 5531bb0..5a214e1 100644 --- a/javascript/package-lock.json +++ b/javascript/package-lock.json @@ -1,12 +1,12 @@ { "name": "@spider-cloud/spider-client", - "version": "0.0.37", + "version": "0.0.38", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@spider-cloud/spider-client", - "version": "0.0.37", + "version": "0.0.38", "license": "MIT", "devDependencies": { "@jest/globals": "^29.7.0", diff --git a/javascript/package.json b/javascript/package.json index aceb2c3..190db34 100644 --- a/javascript/package.json +++ b/javascript/package.json @@ -1,6 +1,6 @@ { "name": "@spider-cloud/spider-client", - "version": "0.0.37", + "version": "0.0.38", "description": "A Javascript SDK for Spider Cloud services", "scripts": { "test": "jest", diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000..8c99e88 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,2 @@ +pytest +python-dotenv \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index 9f9df8b..17fc846 100644 --- a/python/setup.py +++ b/python/setup.py @@ -8,7 +8,7 @@ def read_file(fname): setup( name="spider-client", - version="0.0.37", + version="0.0.38", url="https://github.com/spider-rs/spider-clients/tree/main/python", author="Spider", author_email="jeff@a11ywatch.com", diff --git a/python/spider/spider.py b/python/spider/spider.py index 83c00d8..7ddb131 100644 --- a/python/spider/spider.py +++ b/python/spider/spider.py @@ -19,7 +19,7 @@ def api_post( self, endpoint: str, data: dict, - stream: bool, + stream: Optional[bool], content_type: str = "application/json", ): """ @@ -60,22 +60,29 @@ def api_get( self._handle_error(response, f"get from {endpoint}") def api_delete( - self, endpoint: str, stream: bool, content_type: str = "application/json" + self, + endpoint: str, + params: Optional[RequestParamsDict] = None, + stream: Optional[bool] = False, 
+ content_type: Optional[str] = "application/json", ): """ Send a DELETE request to the specified endpoint. :param endpoint: The API endpoint from which to retrieve data. + :param params: Optional parameters to include in the DELETE request. + :param stream: Boolean indicating if the response should be streamed. + :param content_type: The content type of the request. :return: The JSON decoded response. """ headers = self._prepare_headers(content_type) response = self._delete_request( - f"https://api.spider.cloud/v1/{endpoint}", headers, stream - ) + f"https://api.spider.cloud/v1/{endpoint}", headers, params, stream + ) if response.status_code in [200, 202]: return response.json() else: - self._handle_error(response, f"get from {endpoint}") + self._handle_error(response, f"delete from {endpoint}") def scrape_url( self, @@ -261,17 +268,19 @@ def get_crawl_state( self, url: str, params: Optional[RequestParamsDict] = None, - stream: bool = False, - content_type: str = "application/json", + stream: Optional[bool] = False, + content_type: Optional[str] = "application/json", ): """ Retrieve the website active crawl state. :return: JSON response of the crawl state and credits used. """ - return self.api_post( - "data/crawl_status", {"url": url, **(params or {}, stream, content_type)} - ) + payload = {"url": url, "stream": stream, "content_type": content_type} + if params: + payload.update(params) + + return self.api_post("data/crawl_state", payload, stream) def get_credits(self): """ @@ -320,7 +329,7 @@ def _prepare_headers(self, content_type: str = "application/json"): return { "Content-Type": content_type, "Authorization": f"Bearer {self.api_key}", - "User-Agent": f"Spider-Client/0.0.37", + "User-Agent": f"Spider-Client/0.0.38", } def _post_request(self, url: str, data, headers, stream=False): @@ -329,8 +338,8 @@ def _post_request(self, url: str, data, headers, stream=False): def _get_request(self, url: str, headers, stream=False): return requests.get(url, headers=headers, stream=stream) - def _delete_request(self, url: str, headers, stream=False): - return requests.delete(url, headers=headers, stream=stream) + def _delete_request(self, url: str, headers, params=None, stream=False): + return requests.delete(url, headers=headers, params=params, stream=stream) def _handle_error(self, response, action): if response.status_code in [402, 409, 500]: diff --git a/python/tests/test_spider.py b/python/tests/test_spider.py index d8e56b9..19f55ad 100644 --- a/python/tests/test_spider.py +++ b/python/tests/test_spider.py @@ -1,109 +1,81 @@ +import pytest from spider.spider import Spider from spider.spider_types import RequestParamsDict +from dotenv import load_dotenv -def main(): - spider = Spider() +load_dotenv() - # Test scrape_url method - url = "http://example.com" - params: RequestParamsDict = { +@pytest.fixture +def spider(): + return Spider() + +@pytest.fixture +def url(): + return "http://example.com" + +@pytest.fixture +def params(): + return { "limit": 1, - "return_format": "html2text", + "return_format": "markdown", "depth": 2, "cache": True, + "domain": "example.com" } - try: - response = spider.scrape_url(url, params=params) - print("scrape_url response:", response) - except Exception as e: - print("scrape_url error:", e) - - # Test crawl_url method - try: - response = spider.crawl_url(url, params=params) - print("crawl_url response:", response) - except Exception as e: - print("crawl_url error:", e) - - # Test links method - try: - response = spider.links(url, params=params) - print("links 
response:", response) - except Exception as e: - print("links error:", e) - - # Test screenshot method - try: - response = spider.screenshot(url, params=params) - print("screenshot response:", response) - except Exception as e: - print("screenshot error:", e) - - # Test search method - try: - response = spider.search("example search query", params=params) - print("search response:", response) - except Exception as e: - print("search error:", e) - - # Test transform method - try: - transform_data = [{"html": "Example", "url": url}] - response = spider.transform(transform_data, params=params) - print("transform response:", response) - except Exception as e: - print("transform error:", e) - - # Test extract_contacts method - try: - response = spider.extract_contacts(url, params=params) - print("extract_contacts response:", response) - except Exception as e: - print("extract_contacts error:", e) - - # Test label method - try: - response = spider.label(url, params=params) - print("label response:", response) - except Exception as e: - print("label error:", e) - - # Test get_crawl_state method - try: - response = spider.get_crawl_state(url, params=params) - print("get_crawl_state response:", response) - except Exception as e: - print("get_crawl_state error:", e) - - # Test get_credits method - try: - response = spider.get_credits() - print("get_credits response:", response) - except Exception as e: - print("get_credits error:", e) - - # Test data_post method - try: - table = "websites" - post_data: RequestParamsDict = {"url": url} - response = spider.data_post(table, post_data) - print("data_post response:", response) - except Exception as e: - print("data_post error:", e) - - # Test data_get method - try: - response = spider.data_get(table, params=params) - print("data_get response:", response) - except Exception as e: - print("data_get error:", e) - - # Test data_delete method - try: - response = spider.data_delete(table, params=params) - print("data_delete response:", response) - except Exception as e: - print("data_delete error:", e) - -if __name__ == "__main__": - main() + +def test_scrape_url(spider, url, params): + response = spider.scrape_url(url, params=params) + assert response is not None + +def test_crawl_url(spider, url, params): + response = spider.crawl_url(url, params=params) + assert response is not None + +def test_links(spider, url, params): + response = spider.links(url, params=params) + assert response is not None + +def test_screenshot(spider, url, params): + response = spider.screenshot(url, params=params) + assert response is not None + +def test_search(spider, params): + response = spider.search("example search query", params=params) + assert response is not None + +def test_transform(spider, url, params): + transform_data = [{"html": "Example", "url": url}] + response = spider.transform(transform_data, params=params) + assert response is not None + +def test_extract_contacts(spider, url, params): + response = spider.extract_contacts(url, params=params) + assert response is not None + +def test_label(spider, url, params): + response = spider.label(url, params=params) + assert response is not None + +def test_get_crawl_state(spider, url, params): + response = spider.get_crawl_state(url, params=params) + assert response is not None + +def test_get_credits(spider): + response = spider.get_credits() + assert response is not None + +def test_data_post(spider, url): + table = "websites" + post_data: RequestParamsDict = {"url": url} + response = spider.data_post(table, 
post_data) + assert response is not None + +def test_data_get(spider, url, params): + table = "websites" + response = spider.data_get(table, params=params) + assert response is not None + +# def test_data_delete(spider, url, params): +# table = "websites" +# response = spider.data_delete(table, params=params) +# assert response is not None \ No newline at end of file
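
For reviewers, a minimal usage sketch of the crawl_state query this patch adds. It assumes SPIDER_API_KEY is exported in the environment (the same secret the CI workflow passes to pytest); the URL and params values are placeholders, not part of the change.

    from spider.spider import Spider

    # Spider() picks up SPIDER_API_KEY from the environment, as in CI above.
    spider = Spider()

    # The state query now posts to "data/crawl_state"; stream and content_type
    # travel in the JSON payload together with any caller-supplied params.
    state = spider.get_crawl_state(
        "http://example.com",  # placeholder URL
        params={"limit": 1},   # placeholder params
    )
    print(state)  # JSON describing the active crawl state and credits used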
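
And a sketch of the reworked delete path: api_delete now forwards params to requests.delete as query parameters instead of dropping them. This assumes data_delete threads params through to the new api_delete signature (not shown in the hunk), and the table name and filter are placeholders; the data_delete test itself stays commented out above, so this is untested against the live API.

    from spider.spider import Spider

    spider = Spider()  # again assumes SPIDER_API_KEY is exported

    # With this patch, params reach requests.delete(url, params=...) as query
    # parameters rather than being dropped. "websites" mirrors the table used
    # in the tests; the filter value is illustrative only.
    result = spider.data_delete("websites", params={"domain": "example.com"})
    print(result)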