diff --git a/edgar/company.py b/edgar/company.py index 1661f01..dcaa73d 100644 --- a/edgar/company.py +++ b/edgar/company.py @@ -1,3 +1,4 @@ +# -*- coding: UTF-8 -*- from typing import List import os import requests @@ -7,6 +8,19 @@ BASE_URL = "https://www.sec.gov" +# The required SEC EDGAR request header +SEC_HEADERS = { + 'user-agent': 'Edgar oit@sec.gov', + 'accept-encoding': 'gzip, deflate', + 'host': 'www.sec.gov', + 'referer': 'https://www.sec.gov/', + 'cache-control': 'no-cache', + #'connection': 'close' + #'connection': 'keep-alive' +} +# Set new default requests header +headers = SEC_HEADERS + class Company(): def __init__(self, name, cik, timeout=10): @@ -15,7 +29,6 @@ def __init__(self, name, cik, timeout=10): self.url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}" self.timeout = timeout self._document_urls = [] - self.get_company_info() @property @@ -23,7 +36,7 @@ def document_urls(self): return list(set(self._document_urls)) def _get(self, url): - return requests.get(url, timeout=self.timeout) + return requests.get(url, timeout=self.timeout, headers=SEC_HEADERS) def get_company_info(self): page = html.fromstring(self._get(self.url).content) @@ -127,7 +140,7 @@ def get_10K(self) -> List[lxml.html.HtmlElement]: @classmethod def get_request(cls, href, isxml=False, timeout=10): - page = requests.get(href, timeout=timeout) + page = requests.get(href, timeout=timeout, headers=SEC_HEADERS) if isxml: p = etree.XMLParser(huge_tree=True) return etree.fromstring(page.content, parser=p) diff --git a/edgar/document.py b/edgar/document.py index fed2e4c..2b8113e 100644 --- a/edgar/document.py +++ b/edgar/document.py @@ -1,11 +1,25 @@ +# -*- coding: UTF-8 -*- import requests from lxml import html +# The required SEC EDGAR request header +SEC_HEADERS = { + 'user-agent': 'Edgar oit@sec.gov', + 'accept-encoding': 'gzip, deflate', + 'host': 'www.sec.gov', + 'referer': 'https://www.sec.gov/', + 'cache-control': 'no-cache', + #'connection': 'close' + #'connection': 'keep-alive' +} +# Set new default requests header +headers = SEC_HEADERS + class Document: def __init__(self, url, timeout=10): self.url = url - self.text = requests.get(self.url, timeout=timeout).content + self.text = requests.get(self.url, timeout=timeout, headers=SEC_HEADERS).content class Documents(str): @@ -14,13 +28,13 @@ def __get_text_from_list__(self, arr): def __init__(self, url, timeout=10): self.url = url - page = requests.get(self.url, timeout=timeout) + page = requests.get(self.url, timeout=timeout, headers=SEC_HEADERS) tree = html.fromstring(page.content) content = tree.find_class("formContent")[0] info_head = self.__get_text_from_list__(content.find_class("infoHead")) info = self.__get_text_from_list__(content.find_class("info")) self.content = dict(zip(info_head, info)) - self.element = html.fromstring(requests.get(self.url, timeout=timeout).content) + self.element = html.fromstring(requests.get(self.url, timeout=timeout, headers=SEC_HEADERS).content) def __repr__(self): return str(self.__dict__) diff --git a/edgar/edgar.py b/edgar/edgar.py index bde3c4b..aeee817 100644 --- a/edgar/edgar.py +++ b/edgar/edgar.py @@ -1,30 +1,60 @@ +# -*- coding: UTF-8 -*- from typing import Tuple, List, Any, Dict from lxml import html from tqdm import tqdm import os import requests -from fuzzywuzzy import process, fuzz +from rapidfuzz import process, fuzz + class Edgar(): def __init__(self, companies_page_path=None): - all_companies_content : str - if companies_page_path is not None and os.path.isfile(companies_page_path): - all_companies_content = open(companies_page_path, encoding="latin-1").read() - else: - all_companies_page = requests.get("https://www.sec.gov/Archives/edgar/cik-lookup-data.txt") - all_companies_content = all_companies_page.content.decode("latin1") - all_companies_array = all_companies_content.split("\n") - del all_companies_array[-1] - all_companies_array_rev = [] - for i, item in enumerate(all_companies_array): - if item == "": - continue - _name, _cik = Edgar.split_raw_string_to_cik_name(item) - all_companies_array[i] = (_name, _cik) - all_companies_array_rev.append((_cik, _name)) - self.all_companies_dict = dict(all_companies_array) - self.all_companies_dict_rev = dict(all_companies_array_rev) + + # The required SEC EDGAR request header + SEC_HEADERS = { + 'user-agent': 'Edgar oit@sec.gov', + 'accept-encoding': 'gzip, deflate', + 'host': 'www.sec.gov', + 'referer': 'https://www.sec.gov/', + 'cache-control': 'no-cache', + #'connection': 'close' + #'connection': 'keep-alive' + } + # Set new default requests header + headers = SEC_HEADERS + + # Add patch from: + # https://github.com/NetSPI/NetblockTool/issues/3#issuecomment-897138800 + # Here we use the while loop as a poor-mans patch for rate limiting. + # When that occurs, the first item returns a non-html doc... + rate_limited = 0 + while not rate_limited: + + all_companies_content : str + if companies_page_path is not None and os.path.isfile(companies_page_path): + all_companies_content = open(companies_page_path, encoding="latin-1").read() + else: + edgar_url = "https://www.sec.gov/Archives/edgar/cik-lookup-data.txt" + all_companies_page = requests.get(edgar_url, headers=SEC_HEADERS) + all_companies_content = all_companies_page.content.decode("latin1") + all_companies_array = all_companies_content.split("\n") + + # Check for rate limiting garbage... + item_arr = all_companies_array[0].split(":") + if item_arr[0] != ' str: return self.all_companies_dict[name] @@ -36,7 +66,7 @@ def match_company_by_company_name(self, name, top=5, progress=True) -> List[Dict ): result.append({"company_name": company, "cik": cik, "score": fuzz.partial_ratio(name, company)}) return sorted(result, key=lambda row: row["score"], reverse=True)[:top] - + def get_company_name_by_cik(self, cik) -> str: return self.all_companies_dict_rev[cik] diff --git a/requirements.txt b/requirements.txt index 8a8ee4c..83db4dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ requests lxml tqdm -fuzzywuzzy[speedup] +rapidfuzz +