-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Added new required request headers for SEC EDGAR * Added UTF-8 header * Changed dependency from broken fuzzywuzzy to new rapidfuzz. Fixes #28 * Fixes #24 (with a poor man's patch, taken from a user comment, for checking rate limits) * Minor formatting adjustments for code readability Changes to be committed: modified: edgar/company.py modified: edgar/document.py modified: edgar/edgar.py modified: requirements.txt
- Loading branch information
Showing
4 changed files
with
84 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
# -*- coding: UTF-8 -*- | ||
from typing import List | ||
import os | ||
import requests | ||
|
@@ -7,6 +8,19 @@ | |
|
||
BASE_URL = "https://www.sec.gov" | ||
|
||
# The required SEC EDGAR request header | ||
SEC_HEADERS = { | ||
'user-agent': 'Edgar [email protected]', | ||
'accept-encoding': 'gzip, deflate', | ||
'host': 'www.sec.gov', | ||
'referer': 'https://www.sec.gov/', | ||
'cache-control': 'no-cache', | ||
#'connection': 'close' | ||
#'connection': 'keep-alive' | ||
} | ||
# Set new default requests header | ||
headers = SEC_HEADERS | ||
|
||
class Company(): | ||
|
||
def __init__(self, name, cik, timeout=10): | ||
|
@@ -15,15 +29,14 @@ def __init__(self, name, cik, timeout=10): | |
self.url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}" | ||
self.timeout = timeout | ||
self._document_urls = [] | ||
|
||
self.get_company_info() | ||
|
||
@property | ||
def document_urls(self): | ||
return list(set(self._document_urls)) | ||
|
||
def _get(self, url): | ||
return requests.get(url, timeout=self.timeout) | ||
return requests.get(url, timeout=self.timeout, headers=SEC_HEADERS) | ||
|
||
def get_company_info(self): | ||
page = html.fromstring(self._get(self.url).content) | ||
|
@@ -127,7 +140,7 @@ def get_10K(self) -> List[lxml.html.HtmlElement]: | |
|
||
@classmethod | ||
def get_request(cls, href, isxml=False, timeout=10): | ||
page = requests.get(href, timeout=timeout) | ||
page = requests.get(href, timeout=timeout, headers=SEC_HEADERS) | ||
if isxml: | ||
p = etree.XMLParser(huge_tree=True) | ||
return etree.fromstring(page.content, parser=p) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,25 @@ | ||
# -*- coding: UTF-8 -*- | ||
import requests | ||
from lxml import html | ||
|
||
# The required SEC EDGAR request header | ||
SEC_HEADERS = { | ||
'user-agent': 'Edgar [email protected]', | ||
'accept-encoding': 'gzip, deflate', | ||
'host': 'www.sec.gov', | ||
'referer': 'https://www.sec.gov/', | ||
'cache-control': 'no-cache', | ||
#'connection': 'close' | ||
#'connection': 'keep-alive' | ||
} | ||
# Set new default requests header | ||
headers = SEC_HEADERS | ||
|
||
class Document: | ||
|
||
def __init__(self, url, timeout=10): | ||
self.url = url | ||
self.text = requests.get(self.url, timeout=timeout).content | ||
self.text = requests.get(self.url, timeout=timeout, headers=SEC_HEADERS).content | ||
|
||
class Documents(str): | ||
|
||
|
@@ -14,13 +28,13 @@ def __get_text_from_list__(self, arr): | |
|
||
def __init__(self, url, timeout=10): | ||
self.url = url | ||
page = requests.get(self.url, timeout=timeout) | ||
page = requests.get(self.url, timeout=timeout, headers=SEC_HEADERS) | ||
tree = html.fromstring(page.content) | ||
content = tree.find_class("formContent")[0] | ||
info_head = self.__get_text_from_list__(content.find_class("infoHead")) | ||
info = self.__get_text_from_list__(content.find_class("info")) | ||
self.content = dict(zip(info_head, info)) | ||
self.element = html.fromstring(requests.get(self.url, timeout=timeout).content) | ||
self.element = html.fromstring(requests.get(self.url, timeout=timeout, headers=SEC_HEADERS).content) | ||
|
||
def __repr__(self): | ||
return str(self.__dict__) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,60 @@ | ||
# -*- coding: UTF-8 -*- | ||
from typing import Tuple, List, Any, Dict | ||
from lxml import html | ||
from tqdm import tqdm | ||
import os | ||
import requests | ||
from fuzzywuzzy import process, fuzz | ||
from rapidfuzz import process, fuzz | ||
|
||
|
||
class Edgar(): | ||
|
||
def __init__(self, companies_page_path=None): | ||
all_companies_content : str | ||
if companies_page_path is not None and os.path.isfile(companies_page_path): | ||
all_companies_content = open(companies_page_path, encoding="latin-1").read() | ||
else: | ||
all_companies_page = requests.get("https://www.sec.gov/Archives/edgar/cik-lookup-data.txt") | ||
all_companies_content = all_companies_page.content.decode("latin1") | ||
all_companies_array = all_companies_content.split("\n") | ||
del all_companies_array[-1] | ||
all_companies_array_rev = [] | ||
for i, item in enumerate(all_companies_array): | ||
if item == "": | ||
continue | ||
_name, _cik = Edgar.split_raw_string_to_cik_name(item) | ||
all_companies_array[i] = (_name, _cik) | ||
all_companies_array_rev.append((_cik, _name)) | ||
self.all_companies_dict = dict(all_companies_array) | ||
self.all_companies_dict_rev = dict(all_companies_array_rev) | ||
|
||
# The required SEC EDGAR request header | ||
SEC_HEADERS = { | ||
'user-agent': 'Edgar [email protected]', | ||
'accept-encoding': 'gzip, deflate', | ||
'host': 'www.sec.gov', | ||
'referer': 'https://www.sec.gov/', | ||
'cache-control': 'no-cache', | ||
#'connection': 'close' | ||
#'connection': 'keep-alive' | ||
} | ||
# Set new default requests header | ||
headers = SEC_HEADERS | ||
|
||
# Add patch from: | ||
# https://github.com/NetSPI/NetblockTool/issues/3#issuecomment-897138800 | ||
# Here we use the while loop as a poor man's patch for rate limiting. | ||
# When that occurs, the first line of the response is an HTML DOCTYPE instead of the expected lookup data, so the request is retried... | ||
rate_limited = 0 | ||
while not rate_limited: | ||
|
||
all_companies_content : str | ||
if companies_page_path is not None and os.path.isfile(companies_page_path): | ||
all_companies_content = open(companies_page_path, encoding="latin-1").read() | ||
else: | ||
edgar_url = "https://www.sec.gov/Archives/edgar/cik-lookup-data.txt" | ||
all_companies_page = requests.get(edgar_url, headers=SEC_HEADERS) | ||
all_companies_content = all_companies_page.content.decode("latin1") | ||
all_companies_array = all_companies_content.split("\n") | ||
|
||
# Check for rate limiting garbage... | ||
item_arr = all_companies_array[0].split(":") | ||
if item_arr[0] != '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http': | ||
rate_limited = 1 | ||
|
||
del all_companies_array[-1] | ||
all_companies_array_rev = [] | ||
for i, item in enumerate(all_companies_array): | ||
if item == "": | ||
continue | ||
_name, _cik = Edgar.split_raw_string_to_cik_name(item) | ||
all_companies_array[i] = (_name, _cik) | ||
all_companies_array_rev.append((_cik, _name)) | ||
self.all_companies_dict = dict(all_companies_array) | ||
self.all_companies_dict_rev = dict(all_companies_array_rev) | ||
|
||
def get_cik_by_company_name(self, name) -> str: | ||
return self.all_companies_dict[name] | ||
|
@@ -36,7 +66,7 @@ def match_company_by_company_name(self, name, top=5, progress=True) -> List[Dict | |
): | ||
result.append({"company_name": company, "cik": cik, "score": fuzz.partial_ratio(name, company)}) | ||
return sorted(result, key=lambda row: row["score"], reverse=True)[:top] | ||
|
||
def get_company_name_by_cik(self, cik) -> str: | ||
return self.all_companies_dict_rev[cik] | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
requests | ||
lxml | ||
tqdm | ||
fuzzywuzzy[speedup] | ||
rapidfuzz | ||
|