Skip to content

Commit

Permalink
♦️ - maintenance update for required SEC headers (#29)
Browse files Browse the repository at this point in the history
* Added new required request headers for SEC EDGAR
* Added UTF-8 header
* Changed dependency from broken fuzzywuzzy to new rapidfuzz. Fixes #28
* Fixes #24 (with user comment poor-mans patch for checking rate limits)
* minor formatting adjustments for code readability

Changes to be committed:
	modified:   edgar/company.py
	modified:   edgar/document.py
	modified:   edgar/edgar.py
	modified:   requirements.txt
  • Loading branch information
eabase authored Nov 15, 2022
1 parent ee67e7a commit ae74960
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 26 deletions.
19 changes: 16 additions & 3 deletions edgar/company.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: UTF-8 -*-
from typing import List
import os
import requests
Expand All @@ -7,6 +8,19 @@

BASE_URL = "https://www.sec.gov"

# Request headers that SEC EDGAR requires (per its fair-access policy);
# requests without a descriptive user-agent get rejected or throttled.
SEC_HEADERS = dict([
    ('user-agent', 'Edgar [email protected]'),
    ('accept-encoding', 'gzip, deflate'),
    ('host', 'www.sec.gov'),
    ('referer', 'https://www.sec.gov/'),
    ('cache-control', 'no-cache'),
])
# Module-level default header set, aliased for convenience.
headers = SEC_HEADERS

class Company():

def __init__(self, name, cik, timeout=10):
Expand All @@ -15,15 +29,14 @@ def __init__(self, name, cik, timeout=10):
self.url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}"
self.timeout = timeout
self._document_urls = []

self.get_company_info()

@property
def document_urls(self):
return list(set(self._document_urls))

def _get(self, url):
    """GET *url* with the required SEC headers and this company's timeout.

    All Company requests are funneled through here so the SEC-mandated
    headers are attached consistently.
    """
    # Fixed: the diff artifact left an unreachable duplicate return
    # (the pre-headers version) before this line; it is removed.
    return requests.get(url, timeout=self.timeout, headers=SEC_HEADERS)

def get_company_info(self):
page = html.fromstring(self._get(self.url).content)
Expand Down Expand Up @@ -127,7 +140,7 @@ def get_10K(self) -> List[lxml.html.HtmlElement]:

@classmethod
def get_request(cls, href, isxml=False, timeout=10):
page = requests.get(href, timeout=timeout)
page = requests.get(href, timeout=timeout, headers=SEC_HEADERS)
if isxml:
p = etree.XMLParser(huge_tree=True)
return etree.fromstring(page.content, parser=p)
Expand Down
20 changes: 17 additions & 3 deletions edgar/document.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
# -*- coding: UTF-8 -*-
import requests
from lxml import html

# Request headers that SEC EDGAR requires (per its fair-access policy);
# requests without a descriptive user-agent get rejected or throttled.
SEC_HEADERS = dict([
    ('user-agent', 'Edgar [email protected]'),
    ('accept-encoding', 'gzip, deflate'),
    ('host', 'www.sec.gov'),
    ('referer', 'https://www.sec.gov/'),
    ('cache-control', 'no-cache'),
])
# Module-level default header set, aliased for convenience.
headers = SEC_HEADERS

class Document:
    """A single EDGAR document, fetched eagerly at construction.

    Attributes:
        url:  absolute URL of the document.
        text: raw response bytes of the document.
    """

    def __init__(self, url, timeout=10):
        # timeout: seconds allowed for the GET before requests raises.
        self.url = url
        # Fixed: the diff artifact assigned self.text twice, issuing the GET
        # once without the SEC headers and discarding the result; now a
        # single request with the required headers.
        self.text = requests.get(self.url, timeout=timeout, headers=SEC_HEADERS).content

class Documents(str):

Expand All @@ -14,13 +28,13 @@ def __get_text_from_list__(self, arr):

def __init__(self, url, timeout=10):
    """Fetch the EDGAR filing index page at *url* and parse its metadata.

    Populates:
        self.content: dict mapping "infoHead" labels to "info" values
                      from the page's formContent section.
        self.element: parsed lxml tree of the same page.
    """
    self.url = url
    page = requests.get(self.url, timeout=timeout, headers=SEC_HEADERS)
    tree = html.fromstring(page.content)
    content = tree.find_class("formContent")[0]
    info_head = self.__get_text_from_list__(content.find_class("infoHead"))
    info = self.__get_text_from_list__(content.find_class("info"))
    self.content = dict(zip(info_head, info))
    # Fixed: removed the diff-artifact duplicate GETs, and reuse the
    # already-fetched page body instead of issuing a second identical
    # request for the same URL (halves the load on SEC's servers).
    self.element = html.fromstring(page.content)

def __repr__(self):
return str(self.__dict__)
Expand Down
68 changes: 49 additions & 19 deletions edgar/edgar.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,60 @@
# -*- coding: UTF-8 -*-
from typing import Tuple, List, Any, Dict
from lxml import html
from tqdm import tqdm
import os
import requests
from fuzzywuzzy import process, fuzz
from rapidfuzz import process, fuzz


class Edgar():
    """Name <-> CIK lookup table built from SEC EDGAR's cik-lookup-data.txt."""

    def __init__(self, companies_page_path=None):
        """Load the company list from a local file if given, else from EDGAR.

        companies_page_path: optional path to a cached copy of
            cik-lookup-data.txt (latin-1 encoded). When None or not a file
            on disk, the list is downloaded from sec.gov.
        """
        # Required SEC EDGAR request headers (per SEC fair-access policy).
        SEC_HEADERS = {
            'user-agent': 'Edgar [email protected]',
            'accept-encoding': 'gzip, deflate',
            'host': 'www.sec.gov',
            'referer': 'https://www.sec.gov/',
            'cache-control': 'no-cache',
        }

        # Poor-man's rate-limit handling (see
        # https://github.com/NetSPI/NetblockTool/issues/3#issuecomment-897138800):
        # when EDGAR throttles us it serves an XHTML error page instead of the
        # plain-text lookup file, so keep fetching until the payload does not
        # look like that error document.
        # NOTE(review): retries immediately with no backoff, matching the
        # original behavior — consider adding a sleep between attempts.
        all_companies_array = None
        while all_companies_array is None:
            all_companies_content: str
            if companies_page_path is not None and os.path.isfile(companies_page_path):
                # Fixed: close the file handle deterministically
                # (the original open(...).read() leaked it).
                with open(companies_page_path, encoding="latin-1") as fh:
                    all_companies_content = fh.read()
            else:
                edgar_url = "https://www.sec.gov/Archives/edgar/cik-lookup-data.txt"
                all_companies_page = requests.get(edgar_url, headers=SEC_HEADERS)
                all_companies_content = all_companies_page.content.decode("latin1")
            candidate = all_companies_content.split("\n")

            # A rate-limited response starts with an XHTML doctype line.
            first_item = candidate[0].split(":")
            if first_item[0] != '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http':
                all_companies_array = candidate

        # Parse only once we have real lookup data (the original also ran the
        # parse over the rate-limited garbage on every failed attempt).
        del all_companies_array[-1]  # trailing empty entry after the final "\n"
        all_companies_array_rev = []
        for i, item in enumerate(all_companies_array):
            if item == "":
                continue
            _name, _cik = Edgar.split_raw_string_to_cik_name(item)
            all_companies_array[i] = (_name, _cik)
            all_companies_array_rev.append((_cik, _name))
        # Forward (name -> cik) and reverse (cik -> name) lookup tables.
        self.all_companies_dict = dict(all_companies_array)
        self.all_companies_dict_rev = dict(all_companies_array_rev)

def get_cik_by_company_name(self, name) -> str:
return self.all_companies_dict[name]
Expand All @@ -36,7 +66,7 @@ def match_company_by_company_name(self, name, top=5, progress=True) -> List[Dict
):
result.append({"company_name": company, "cik": cik, "score": fuzz.partial_ratio(name, company)})
return sorted(result, key=lambda row: row["score"], reverse=True)[:top]

def get_company_name_by_cik(self, cik) -> str:
return self.all_companies_dict_rev[cik]

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
requests
lxml
tqdm
fuzzywuzzy[speedup]
rapidfuzz

0 comments on commit ae74960

Please sign in to comment.