Skip to content

Commit

Permalink
♦️ - maintenance update for required SEC headers (#29)
Browse files Browse the repository at this point in the history
* Added new required request headers for SEC EDGAR
* Added UTF-8 header
* Changed dependency from broken fuzzywuzzy to new rapidfuzz. Fixes #28
* Fixes #24 (with user comment poor-mans patch for checking rate limits)
* minor formatting adjustments for code readability

Changes to be committed:
	modified:   edgar/company.py
	modified:   edgar/document.py
	modified:   edgar/edgar.py
	modified:   requirements.txt
  • Loading branch information
eabase authored Nov 15, 2022
1 parent ee67e7a commit ae74960
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 26 deletions.
19 changes: 16 additions & 3 deletions edgar/company.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: UTF-8 -*-
from typing import List
import os
import requests
Expand All @@ -7,6 +8,19 @@

BASE_URL = "https://www.sec.gov"

# Request headers that SEC EDGAR requires (per its fair-access policy);
# requests without a descriptive user-agent get rejected or throttled.
SEC_HEADERS = dict([
    ('user-agent', 'Edgar [email protected]'),
    ('accept-encoding', 'gzip, deflate'),
    ('host', 'www.sec.gov'),
    ('referer', 'https://www.sec.gov/'),
    ('cache-control', 'no-cache'),
])
# Module-level default header set, aliased for convenience.
headers = SEC_HEADERS

class Company():

def __init__(self, name, cik, timeout=10):
Expand All @@ -15,15 +29,14 @@ def __init__(self, name, cik, timeout=10):
self.url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}"
self.timeout = timeout
self._document_urls = []

self.get_company_info()

@property
def document_urls(self):
return list(set(self._document_urls))

def _get(self, url):
    """GET *url* with the required SEC headers and this company's timeout.

    All Company requests are funneled through here so the SEC-mandated
    headers are attached consistently.
    """
    # Fixed: the diff artifact left an unreachable duplicate return
    # (the pre-headers version) before this line; it is removed.
    return requests.get(url, timeout=self.timeout, headers=SEC_HEADERS)

def get_company_info(self):
page = html.fromstring(self._get(self.url).content)
Expand Down Expand Up @@ -127,7 +140,7 @@ def get_10K(self) -> List[lxml.html.HtmlElement]:

@classmethod
def get_request(cls, href, isxml=False, timeout=10):
page = requests.get(href, timeout=timeout)
page = requests.get(href, timeout=timeout, headers=SEC_HEADERS)
if isxml:
p = etree.XMLParser(huge_tree=True)
return etree.fromstring(page.content, parser=p)
Expand Down
20 changes: 17 additions & 3 deletions edgar/document.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
# -*- coding: UTF-8 -*-
import requests
from lxml import html

# Request headers that SEC EDGAR requires (per its fair-access policy);
# requests without a descriptive user-agent get rejected or throttled.
SEC_HEADERS = dict([
    ('user-agent', 'Edgar [email protected]'),
    ('accept-encoding', 'gzip, deflate'),
    ('host', 'www.sec.gov'),
    ('referer', 'https://www.sec.gov/'),
    ('cache-control', 'no-cache'),
])
# Module-level default header set, aliased for convenience.
headers = SEC_HEADERS

class Document:
    """A single EDGAR document, fetched eagerly at construction.

    Attributes:
        url:  absolute URL of the document.
        text: raw response bytes of the document.
    """

    def __init__(self, url, timeout=10):
        # timeout: seconds allowed for the GET before requests raises.
        self.url = url
        # Fixed: the diff artifact assigned self.text twice, issuing the GET
        # once without the SEC headers and discarding the result; now a
        # single request with the required headers.
        self.text = requests.get(self.url, timeout=timeout, headers=SEC_HEADERS).content

class Documents(str):

Expand All @@ -14,13 +28,13 @@ def __get_text_from_list__(self, arr):

def __init__(self, url, timeout=10):
    """Fetch the EDGAR filing index page at *url* and parse its metadata.

    Populates:
        self.content: dict mapping "infoHead" labels to "info" values
                      from the page's formContent section.
        self.element: parsed lxml tree of the same page.
    """
    self.url = url
    page = requests.get(self.url, timeout=timeout, headers=SEC_HEADERS)
    tree = html.fromstring(page.content)
    content = tree.find_class("formContent")[0]
    info_head = self.__get_text_from_list__(content.find_class("infoHead"))
    info = self.__get_text_from_list__(content.find_class("info"))
    self.content = dict(zip(info_head, info))
    # Fixed: removed the diff-artifact duplicate GETs, and reuse the
    # already-fetched page body instead of issuing a second identical
    # request for the same URL (halves the load on SEC's servers).
    self.element = html.fromstring(page.content)

def __repr__(self):
return str(self.__dict__)
Expand Down
68 changes: 49 additions & 19 deletions edgar/edgar.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,60 @@
# -*- coding: UTF-8 -*-
from typing import Tuple, List, Any, Dict
from lxml import html
from tqdm import tqdm
import os
import requests
from fuzzywuzzy import process, fuzz
from rapidfuzz import process, fuzz


class Edgar():
    """Name <-> CIK lookup table built from SEC EDGAR's cik-lookup-data.txt."""

    def __init__(self, companies_page_path=None):
        """Load the company list from a local file if given, else from EDGAR.

        companies_page_path: optional path to a cached copy of
            cik-lookup-data.txt (latin-1 encoded). When None or not a file
            on disk, the list is downloaded from sec.gov.
        """
        # Required SEC EDGAR request headers (per SEC fair-access policy).
        SEC_HEADERS = {
            'user-agent': 'Edgar [email protected]',
            'accept-encoding': 'gzip, deflate',
            'host': 'www.sec.gov',
            'referer': 'https://www.sec.gov/',
            'cache-control': 'no-cache',
        }

        # Poor-man's rate-limit handling (see
        # https://github.com/NetSPI/NetblockTool/issues/3#issuecomment-897138800):
        # when EDGAR throttles us it serves an XHTML error page instead of the
        # plain-text lookup file, so keep fetching until the payload does not
        # look like that error document.
        # NOTE(review): retries immediately with no backoff, matching the
        # original behavior — consider adding a sleep between attempts.
        all_companies_array = None
        while all_companies_array is None:
            all_companies_content: str
            if companies_page_path is not None and os.path.isfile(companies_page_path):
                # Fixed: close the file handle deterministically
                # (the original open(...).read() leaked it).
                with open(companies_page_path, encoding="latin-1") as fh:
                    all_companies_content = fh.read()
            else:
                edgar_url = "https://www.sec.gov/Archives/edgar/cik-lookup-data.txt"
                all_companies_page = requests.get(edgar_url, headers=SEC_HEADERS)
                all_companies_content = all_companies_page.content.decode("latin1")
            candidate = all_companies_content.split("\n")

            # A rate-limited response starts with an XHTML doctype line.
            first_item = candidate[0].split(":")
            if first_item[0] != '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http':
                all_companies_array = candidate

        # Parse only once we have real lookup data (the original also ran the
        # parse over the rate-limited garbage on every failed attempt).
        del all_companies_array[-1]  # trailing empty entry after the final "\n"
        all_companies_array_rev = []
        for i, item in enumerate(all_companies_array):
            if item == "":
                continue
            _name, _cik = Edgar.split_raw_string_to_cik_name(item)
            all_companies_array[i] = (_name, _cik)
            all_companies_array_rev.append((_cik, _name))
        # Forward (name -> cik) and reverse (cik -> name) lookup tables.
        self.all_companies_dict = dict(all_companies_array)
        self.all_companies_dict_rev = dict(all_companies_array_rev)

def get_cik_by_company_name(self, name) -> str:
return self.all_companies_dict[name]
Expand All @@ -36,7 +66,7 @@ def match_company_by_company_name(self, name, top=5, progress=True) -> List[Dict
):
result.append({"company_name": company, "cik": cik, "score": fuzz.partial_ratio(name, company)})
return sorted(result, key=lambda row: row["score"], reverse=True)[:top]

def get_company_name_by_cik(self, cik) -> str:
return self.all_companies_dict_rev[cik]

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
requests
lxml
tqdm
fuzzywuzzy[speedup]
rapidfuzz

0 comments on commit ae74960

Please sign in to comment.