Search twin commits and misc other improvements (#342)
- Implemented twin commits, closes #147
- Using pydantic's BaseModel again
- Fixed the logger, improved log aesthetics and readability
- GitHub and Jira issues are collected via their respective APIs (a GitHub token is required via the .env file)
- Better word extraction and filtering, based on tests conducted on the small dataset
- Report handlers now create nested folders
- Updated to Python 3.10
- Skip GitHub when fetching references
- Refactored the git and raw_commit modules: all commit IDs, timestamps, parents, messages and changed files are collected with a single call to the git executable (see the sketch below)
- Refactored logging

- Fixes: #339, #341, #334, #331, #326, #336
sacca97 authored Nov 3, 2022
1 parent 881dd45 commit 7284e34
Showing 69 changed files with 2,140 additions and 2,111 deletions.
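Before the per-file diffs, a rough illustration of the single-call collection mentioned in the commit message. This is a minimal sketch, not the actual git/raw_commit implementation; the format string, field names, and parsing are assumptions.

```python
# Hypothetical sketch: gather id, timestamp, parents, message and changed
# files for every commit with one invocation of the git executable.
import subprocess


def get_raw_commits(repo_path: str) -> list[dict]:
    # %x1e emits an ASCII record separator so commits split reliably;
    # %H = id, %at = author timestamp, %P = parents, %s = subject.
    out = subprocess.check_output(
        [
            "git", "-C", repo_path, "log", "--name-only",
            "--format=%x1e%H%x09%at%x09%P%x09%s",
        ],
        text=True,
    )
    commits = []
    for record in out.split("\x1e"):
        lines = [ln for ln in record.splitlines() if ln.strip()]
        if not lines:
            continue
        commit_id, timestamp, parents, message = lines[0].split("\t", 3)
        commits.append(
            {
                "commit_id": commit_id,
                "timestamp": int(timestamp),
                "parent": parents.split(),
                "message": message,
                "changed_files": lines[1:],  # paths from --name-only
            }
        )
    return commits
```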
8 changes: 4 additions & 4 deletions .github/workflows/python.yml
@@ -41,11 +41,11 @@ jobs:
       # Maps tcp port 5432 on service container to the host
       - 5432:5432
     steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python 3.8
-        uses: actions/setup-python@v2
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: 3.10.6
       - name: Setup virtual environment
         run: |
           python -m pip install --upgrade pip
6 changes: 4 additions & 2 deletions .gitignore
@@ -30,9 +30,10 @@ kaybee/pkged.go
 kaybeeconf.yaml
 prospector/.env
 prospector/workspace.code-workspace
-prospector/.env
+prospector/disabled_tests/skip_test-commits.db
+prospector/disabled_tests/skip_test-vulnerabilities.db
 prospector/results
 prospector/*.py
 prospector/.vscode/launch.json
 prospector/.vscode/settings.json
 prospector/install_fastext.sh
@@ -45,7 +46,8 @@ prospector/client/cli/cov_html/*
 prospector/client/web/node-app/node_modules
 prospector/.coverage.*
 prospector/.coverage
-**/cov_html/*
+**/cov_html
+prospector/cov_html
 .coverage
 prospector/prospector.code-workspace
 prospector/requests-cache.sqlite
2 changes: 1 addition & 1 deletion prospector/.flake8
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E203, E501, W503,F401,F403
+ignore = E203, E501, W503,F401,F403,W605
 exclude =
     # No need to traverse our git directory
     .git,
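For context, W605 flags invalid escape sequences in regular (non-raw) string literals, a common pattern in regex-heavy code; a tiny illustration (not from this repository):

```python
import re

# '\d' in a plain string literal is an invalid escape sequence (flake8 W605);
# Python still treats it literally today, but raw strings are the clean fix.
pattern = re.compile("\d+")      # triggers W605
pattern_ok = re.compile(r"\d+")  # raw string, no warning
```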
16 changes: 10 additions & 6 deletions prospector/Makefile
@@ -13,7 +13,7 @@ test:
 
 setup: requirements.txt
 	@echo "$(PROGRESS) Installing requirements"
-	pip install -r requirements.txt
+	@pip install -r requirements.txt
 	@echo "$(DONE) Installed requirements"
 	@echo "$(PROGRESS) Installing pre-commit and other modules"
 	@pre-commit install
@@ -26,7 +26,7 @@ dev-setup: setup requirements-dev.txt
 	@mkdir -p $(CVE_DATA_PATH)
 	@echo "$(DONE) Created directory $(CVE_DATA_PATH)"
 	@echo "$(PROGRESS) Installing development requirements"
-	pip install -r requirements-dev.txt
+	@pip install -r requirements-dev.txt
 	@echo "$(DONE) Installed development requirements"
 
 docker-setup:
@@ -56,7 +56,7 @@ select-run:
 	python client/cli/main.py $(cve) --repository $(repository) --use-nvd
 
 clean:
-	rm prospector-report.html
-	rm -f all.log* error.log*
-	rm -rf $(GIT_CACHE)/*
-	rm -rf __pycache__
+	@rm -f prospector.log
+	@rm -rf $(GIT_CACHE)/*
+	@rm -rf __pycache__
+	@rm -rf */__pycache__
+	@rm -rf */*/__pycache__
+	@rm -rf *report.html
+	@rm -rf *.json
+	@rm -rf requests-cache.sqlite
9 changes: 0 additions & 9 deletions prospector/api/__init__.py
@@ -1,9 +0,0 @@
-import os
-
-DB_CONNECT_STRING = "postgresql://{}:{}@{}:{}/{}".format(
-    os.environ["POSTGRES_USER"],
-    os.environ["POSTGRES_PASSWORD"],
-    os.environ["POSTGRES_HOST"],
-    os.environ["POSTGRES_PORT"],
-    os.environ["POSTGRES_DBNAME"],
-).lower()
9 changes: 4 additions & 5 deletions prospector/api/api_test.py
@@ -1,5 +1,4 @@
 from fastapi.testclient import TestClient
-import pytest
 
 from api.main import app
 from datamodel.commit import Commit
@@ -22,13 +21,13 @@ def test_status():
 def test_post_preprocessed_commits():
     commit_1 = Commit(
         repository="https://github.com/apache/dubbo", commit_id="yyy"
-    ).__dict__
+    ).as_dict()
     commit_2 = Commit(
         repository="https://github.com/apache/dubbo", commit_id="zzz"
-    ).__dict__
+    ).as_dict()
     commit_3 = Commit(
         repository="https://github.com/apache/struts", commit_id="bbb"
-    ).__dict__
+    ).as_dict()
     commits = [commit_1, commit_2, commit_3]
     response = client.post("/commits/", json=commits)
     assert response.status_code == 200
@@ -43,7 +42,7 @@ def test_get_specific_commit():
     assert response.json()[0]["commit_id"] == commit_id
 
 
-@pytest.mark.skip(reason="will raise exception")
+# @pytest.mark.skip(reason="will raise exception")
 def test_get_commits_by_repository():
     repository = "https://github.com/apache/dubbo"
     response = client.get("/commits/" + repository)
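The switch from `.__dict__` to `.as_dict()` fits the move back to pydantic's BaseModel: an explicit serializer keeps the payload JSON-safe instead of exposing internal attributes. A minimal sketch of the assumed model shape (the real datamodel.commit.Commit has many more fields):

```python
# Assumed shape of the pydantic model behind these tests; field names other
# than repository/commit_id are guesses for illustration.
from typing import List

from pydantic import BaseModel, Field


class Commit(BaseModel):
    repository: str
    commit_id: str = ""
    timestamp: int = 0
    message: str = ""
    changed_files: List[str] = Field(default_factory=list)

    def as_dict(self) -> dict:
        # explicit, JSON-safe serialization (pydantic v1 API)
        return self.dict()
```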
18 changes: 6 additions & 12 deletions prospector/api/main.py
@@ -1,23 +1,12 @@
-# import os
-
 import uvicorn
 from fastapi import FastAPI
 
-# from fastapi import Depends
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import HTMLResponse, RedirectResponse
 
-# from .dependencies import oauth2_scheme
 from api.routers import jobs, nvd, preprocessed, users
 
-# from commitdb.postgres import PostgresCommitDB
-
-# from pprint import pprint
-
-
-# db = PostgresCommitDB()
-# db.connect(DB_CONNECT_STRING)
-
 api_metadata = [
     {"name": "data", "description": "Operations with data used to train ML models."},
     {
@@ -72,4 +61,9 @@ async def get_status():
 
 
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=80)
+
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=80,
+    )
5 changes: 2 additions & 3 deletions prospector/api/routers/jobs.py
@@ -5,11 +5,10 @@
 from rq import Connection, Queue
 from rq.job import Job
 
-import log.util
+from log.logger import logger
 from api.routers.nvd_feed_update import main
 from git.git import do_clone
 
-_logger = log.util.init_local_logger()
 
 redis_url = os.environ["REDIS_URL"]
 
@@ -57,7 +56,7 @@ async def get_job(job_id):
     queue = Queue()
     job = queue.fetch_job(job_id)
     if job:
-        _logger.info("job {} result: {}".format(job.get_id(), job.result))
+        logger.info("job {} result: {}".format(job.get_id(), job.result))
         response_object = {
             "job_data": {
                 "job_id": job.get_id(),
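The refactor replaces per-module `log.util.init_local_logger()` calls with one shared logger object. A minimal sketch of what such a `log/logger.py` might contain (the handler and format are assumptions; the prospector.log filename is suggested by the Makefile's clean target):

```python
# log/logger.py -- hypothetical sketch of a shared module-level logger.
import logging


def create_logger(name: str = "prospector") -> logging.Logger:
    log = logging.getLogger(name)
    log.setLevel(logging.INFO)
    if not log.handlers:  # avoid duplicate handlers on repeated imports
        handler = logging.FileHandler("prospector.log")
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s")
        )
        log.addHandler(handler)
    return log


logger = create_logger()
```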
18 changes: 8 additions & 10 deletions prospector/api/routers/nvd.py
@@ -6,9 +6,7 @@
 from fastapi import APIRouter, HTTPException
 from fastapi.responses import JSONResponse
 
-import log.util
-
-_logger = log.util.init_local_logger()
+from log.logger import logger
 
 
 router = APIRouter(
@@ -25,37 +23,37 @@
 
 @router.get("/vulnerabilities/by-year/{year}")
 async def get_vuln_list_by_year(year: str):
-    _logger.debug("Requested list of vulnerabilities for " + year)
+    logger.debug("Requested list of vulnerabilities for " + year)
 
     if len(year) != 4 or not year.isdigit():
         return JSONResponse([])
 
     data_dir = os.path.join(DATA_PATH, year)
     if not os.path.isdir(data_dir):
-        _logger.info("No data found for year " + year)
+        logger.info("No data found for year " + year)
         raise HTTPException(
             status_code=404, detail="No vulnerabilities found for " + year
         )
 
-    _logger.debug("Serving data for year " + year)
+    logger.debug("Serving data for year " + year)
     vuln_ids = [vid.rstrip(".json") for vid in os.listdir(data_dir)]
     results = {"count": len(vuln_ids), "data": vuln_ids}
     return JSONResponse(results)
 
 
 @router.get("/vulnerabilities/{vuln_id}")
 async def get_vuln_data(vuln_id):
-    _logger.debug("Requested data for vulnerability " + vuln_id)
+    logger.debug("Requested data for vulnerability " + vuln_id)
 
     year = vuln_id.split("-")[1]
     json_file = os.path.join(DATA_PATH, year, vuln_id.upper() + ".json")
     if not os.path.isfile(json_file):
-        _logger.info("No file found: " + json_file)
+        logger.info("No file found: " + json_file)
         raise HTTPException(
             status_code=404, detail=json_file
         )  # detail="Vulnerability data not found")
 
-    _logger.debug("Serving file: " + json_file)
+    logger.debug("Serving file: " + json_file)
     with open(json_file) as f:
         data = json.loads(f.read())
@@ -64,7 +62,7 @@ async def get_vuln_data(vuln_id):
 
 @router.get("/status")
 async def status():
-    _logger.debug("Serving status page")
+    logger.debug("Serving status page")
     out = dict()
     metadata_file = os.path.join(DATA_PATH, "metadata.json")
     if os.path.isfile(metadata_file):
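For reference, these endpoints can be exercised with FastAPI's test client; the /nvd prefix here is an assumption about how the router is mounted:

```python
from fastapi.testclient import TestClient

from api.main import app

client = TestClient(app)

# Assumed mount point; adjust to the actual APIRouter prefix.
response = client.get("/nvd/vulnerabilities/by-year/2021")
if response.status_code == 200:
    print(response.json()["count"])
```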
45 changes: 21 additions & 24 deletions prospector/api/routers/nvd_feed_update.py
@@ -24,9 +24,10 @@
 import requests
 from tqdm import tqdm
 
-import log.util
+from log.logger import logger
 
-_logger = log.util.init_local_logger()
+
+NVD_API_KEY = os.getenv("NVD_API_KEY", "")
 
 # note: The NVD has not data older than 2002
 START_FROM_YEAR = os.getenv("CVE_DATA_AS_OF_YEAR", "2002")
@@ -41,22 +42,20 @@ def do_update(quiet=False):
         with open(os.path.join(DATA_PATH, "metadata.json"), "r") as f:
             last_fetch_metadata = json.load(f)
             if not quiet:
-                _logger.info("last fetch: " + last_fetch_metadata["sha256"])
+                logger.info("last fetch: " + last_fetch_metadata["sha256"])
     except Exception:
         last_fetch_metadata["sha256"] = ""
-        _logger.info(
+        logger.info(
            "Could not read metadata about previous fetches"
            " (this might be the first time we fetch data).",
            exc_info=True,
        )
 
     # read metadata of new data from the NVD site
-    url = "https://nvd.nist.gov/feeds/json/cve/{}/nvdcve-{}-modified.meta".format(
-        FEED_SCHEMA_VERSION, FEED_SCHEMA_VERSION
-    )
-    r = requests.get(url)
+    url = "https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-modified.meta"
+    r = requests.get(url, params={"apiKey": NVD_API_KEY})
     if r.status_code != 200:
-        _logger.error(
+        logger.error(
             "Received status code {} when contacting {}.".format(r.status_code, url)
         )
         return False
@@ -67,12 +66,12 @@
         d_split = d.split(":", 1)
         metadata_dict[d_split[0]] = d_split[1].strip()
     if not quiet:
-        _logger.info("current: " + metadata_dict["sha256"])
+        logger.info("current: " + metadata_dict["sha256"])
 
     # check if the new data is actually new
     if last_fetch_metadata["sha256"] == metadata_dict["sha256"]:
         if not quiet:
-            _logger.info("We already have this update, no new data to fetch.")
+            logger.info("We already have this update, no new data to fetch.")
         return False
 
     do_fetch("modified")
@@ -86,30 +85,28 @@ def do_fetch_full(start_from_year=START_FROM_YEAR, quiet=False):
         y for y in range(int(start_from_year), int(time.strftime("%Y")) + 1)
     ]
     if not quiet:
-        _logger.info("Fetching feeds: " + str(years_to_fetch))
+        logger.info("Fetching feeds: " + str(years_to_fetch))
 
     for y in years_to_fetch:
         if not do_fetch(y):
-            _logger.error("Could not fetch data for year " + str(y))
+            logger.error("Could not fetch data for year " + str(y))
 
 
 def do_fetch(what, quiet=True):
     """
     the 'what' parameter can be a year or 'recent' or 'modified'
     """
-    url = "https://nvd.nist.gov/feeds/json/cve/{}/nvdcve-{}-{}.json.zip".format(
-        FEED_SCHEMA_VERSION, FEED_SCHEMA_VERSION, what
-    )
-    r = requests.get(url)
+    url = f"https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{what}.json.zip"
+    r = requests.get(url, params={"apiKey": NVD_API_KEY})
     if r.status_code != 200:
-        _logger.error(
+        logger.error(
             "Received status code {} when contacting {}.".format(r.status_code, url)
         )
         return False
 
     with closing(r), zipfile.ZipFile(io.BytesIO(r.content)) as archive:
         for f in archive.infolist():
-            _logger.info(f.filename)
+            logger.info(f.filename)
             data = json.loads(archive.read(f).decode())
 
     if not quiet:
@@ -135,17 +132,17 @@ def need_full(quiet=False):
     if os.path.exists(DATA_PATH) and os.path.isdir(DATA_PATH):
         if not os.listdir(DATA_PATH):
             if not quiet:
-                _logger.info("Data folder {} is empty".format(DATA_PATH))
+                logger.info("Data folder {} is empty".format(DATA_PATH))
             return True
 
         # Directory exists and is not empty
         if not quiet:
-            _logger.info("Data folder found at " + DATA_PATH)
+            logger.info("Data folder found at " + DATA_PATH)
         return False
 
     # Directory doesn't exist
     if not quiet:
-        _logger.info("Data folder {} does not exist".format(DATA_PATH))
+        logger.info("Data folder {} does not exist".format(DATA_PATH))
     return True
@@ -162,5 +159,5 @@ def main(force, quiet):
     do_update(quiet=quiet)
 
 
-if __name__ == "__main__":
-    plac.call(main)
+# if __name__ == "__main__":
+#     plac.call(main)
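The .meta file fetched by do_update is a handful of key:value lines, and the sha256 field is what gets compared against the previous fetch. A standalone sketch of that parsing step (the sample content is illustrative, not a real digest):

```python
# Parse an NVD .meta feed document the same way do_update does:
# split each line once on ':' because values may themselves contain ':'.
meta_text = (
    "lastModifiedDate:2022-11-03T13:00:00-04:00\r\n"
    "size:1835813\r\n"
    "sha256:0123456789ABCDEF\r\n"
)

metadata = {}
for line in meta_text.splitlines():
    if not line.strip():
        continue
    key, value = line.split(":", 1)
    metadata[key] = value.strip()

# do_update() only downloads the feed when this digest has changed
print(metadata["sha256"])
```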
