Skip to content

Commit

Permalink
Scan archives zip and tar, scan binaries.
Browse files Browse the repository at this point in the history
  • Loading branch information
bartossh committed Aug 24, 2024
1 parent a6ea0bc commit b8e5aa1
Show file tree
Hide file tree
Showing 11 changed files with 789 additions and 90 deletions.
578 changes: 574 additions & 4 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ regex = "1.10.5"
serde = { version = "1.0.204", features = ["derive", "rc", "serde_derive"] }
serde_json = "1.0.120"
serde_yaml = "0.9.34"
tar = "0.4.41"
thiserror = "1.0.63"
walkdir = "2.5.0"
zip = "2.2.0"

[dev-dependencies]
criterion = "0.5.1"
Expand Down
14 changes: 8 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ Scanner aims to:
- [ ] Scan the git incrementally - from some date range, from some commit hash.
- [ ] Scan the git and identify authors - who introduced the secret.
- [ ] Scan the git for specified branches diff.
- [ ] Scan archives.
- [ ] Scann binaries.
- [x] Scan archives (tar, zip, jar).
- [x] Scan binaries.
- [ ] Scan Confluence and Jira.
- [ ] Scan slack.
- [ ] Scan Postgres database.
Expand All @@ -55,10 +55,10 @@ Scanner aims to:

## Test and build

- Test (will optimize for performance)
- Test (will optimize for performance). Remember to run the tests on a single thread, because the sources share a single temp folder that gets flushed.

```sh
cargo test -- --nocapture
cargo test -- --nocapture --test-threads=1
```
- Bench (optimized build)

Expand Down Expand Up @@ -103,8 +103,10 @@ Options:
--path <Path> Path to directory to scan.
--config <Path> Path to config YAML file used for scanner configuration.
--omit <String> Space separated file patterns to omit
--dedup De duplicates recurring secrets. De duplication happens in the order of scanners in the config file.
--nodeps Omits default dependencies such as npm, venv, gems, ect.
--dedup <u64> Level of de duplications. 0 or not specified - no dedup, 1 - file level dedup
--nodeps If specified omits default dependencies such as npm, venv, gems, etc.
--scan-archives If specified performs archive scanning.
--scan-binary If specified performs binary files scanning.
-h, --help Print help
-V, --version Print version
```
Expand Down
12 changes: 10 additions & 2 deletions src/executor/errors.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
use thiserror::Error;
use crate::source::errors::SourceError;
use crate::inspect::errors::InspectorError;
use std::io;
use zip::result::ZipError;

/// ExecutorError describes all errors that can occur in Executor.
///
/// Every `#[error]` attribute applies to the variant written directly below it,
/// so each message here is paired with the failure it actually describes.
#[derive(Error, Debug)]
pub enum ExecutorError {
    /// An inspector error, converted automatically through `From`.
    #[error("failed due to inspector failuer, {0}")]
    InspectorFailure(#[from] InspectorError),
    /// A source error, converted automatically through `From`.
    #[error("failed due to source io failuer, {0}")]
    GitSourceIoFailure(#[from] SourceError),
    /// A standard I/O error, converted automatically through `From`.
    #[error("failed due file io error, {0}")]
    FileIoFailure(#[from] io::Error),
    /// A zip archive error, converted automatically through `From`.
    #[error("failed due zip archive reading error, {0}")]
    ZipArchiveFailure(#[from] ZipError),
    /// A required parameter is missing or wrong; the payload is the description.
    #[error("failed due to parameter is lacking, {0}")]
    WrongParameterFailure(String),
    /// Any failure that fits none of the variants above; the payload is the description.
    #[error("filed due to unexpected error, {0}")]
    Unexpected(String),
}
118 changes: 105 additions & 13 deletions src/executor/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@ pub mod errors;

use std::{collections::HashSet, path::PathBuf, sync::Arc};
use crossbeam_channel::{unbounded, Sender, Receiver};
use std::fs::read_to_string;
use std::{fs::{read, read_to_string, File}, io::prelude::*};
use rayon::iter::ParallelBridge;
use rayon::prelude::ParallelIterator;
use std::thread::spawn;
use errors::ExecutorError;
use crate::{inspect::Inspector, reporter::Input, source::{BranchLevel, DataSource, Source, Repository, Filesystem}};
use crate::{
inspect::Inspector,
reporter::Input,
source::{
BranchLevel, DataSource, DirectoryProvider, RepositoryProvider, Source,
},
};
use zip::ZipArchive;
use tar::Archive;

const GUESS_OMIT_SIZE: usize = 64;
const FILE_SYSTEM: &str = "------ FILE SYSTEM ------";
Expand All @@ -30,6 +38,8 @@ pub struct Config<'a> {
pub branch_level: BranchLevel,
pub branches: &'a Option<Vec<String>>,
pub sx_input: Sender<Option<Input>>,
pub decompress: bool,
pub scan_binary: bool,
}

/// Executes the scanners with given setup.
Expand All @@ -41,6 +51,8 @@ pub struct Executor {
branch_level: BranchLevel,
branches: Option<HashSet<String>>,
sx_input: Sender<Option<Input>>,
decompress: bool,
scan_binary: bool,
}

impl Executor {
Expand All @@ -50,7 +62,7 @@ impl Executor {
) -> Result<Self, ExecutorError> {
let mut source = match cfg.data_source {
DataSource::Git => Source::new_git(cfg.path, cfg.url)?,
DataSource::FileSystem => Source::new_filesystem(cfg.path)?,
DataSource::FileSystem => Source::new_filesystem_local(cfg.path)?,
};

let config_path = match cfg.config {
Expand Down Expand Up @@ -85,11 +97,13 @@ impl Executor {
branch_level: cfg.branch_level,
branches: if let Some(branches) = cfg.branches { Some(branches.into_iter().map(|v| v.to_owned()).collect::<HashSet<String>>()) } else { None },
sx_input: cfg.sx_input.clone(),
decompress: cfg.decompress,
scan_binary: cfg.scan_binary,
})
}

#[inline(always)]
pub fn execute(&mut self) {
pub fn execute(&mut self) -> Result<(), ExecutorError>{
let mut branches_to_scan = Vec::new();
match &self.branch_level {
BranchLevel::Head => branches_to_scan.push(FILE_SYSTEM.to_string()),
Expand All @@ -108,7 +122,8 @@ impl Executor {
for branch in branches_to_scan.iter() {
let (sx_data, rx_data): (Sender<Option<DataWithInfo>>, Receiver<Option<DataWithInfo>>) = unbounded();
if branch == FILE_SYSTEM {
self.walk_dir(sx_data);
self.walk_dir(sx_data)?;
self.process(rx_data, &branch);
break;
}
if let Some(branches) = &self.branches {
Expand All @@ -123,22 +138,26 @@ impl Executor {
},
};
let branch = branch.to_string().clone();
self.walk_dir(sx_data);
self.walk_dir(sx_data)?;
self.process(rx_data, &branch);
}

let _ = self.sx_input.send(None);
let _ = self.source.flush();

Ok(())
}

#[inline(always)]
fn walk_dir(&self, sx: Sender<Option<DataWithInfo>>) {
fn walk_dir(&self, sx: Sender<Option<DataWithInfo>>) -> Result<(), ExecutorError>{
let Some(walk_dir) = self.source.walk_dir() else {
let _ = self.sx_input.send(None);
return;
return Err(ExecutorError::Unexpected("unable to walk directory".to_string()));
};

let omit = self.omit.clone();
let decompress = self.decompress;
let read_binary = self.scan_binary;

spawn( move || {
'walker: for entry in walk_dir {
Expand All @@ -157,14 +176,12 @@ impl Executor {
}

let entry = entry.into_path();
let Ok(file_data) = read_to_string(&entry) else {
continue;
};

let _ = sx.send(Some(DataWithInfo{data: file_data, file_name: entry.as_path().to_str().unwrap_or_default().to_string()}));
let _ = extract_utf8_and_send(&sx, &entry, decompress, read_binary); // TODO: Introduce error channel.
}
let _ = sx.send(None);
});

Ok(())
}

#[inline(always)]
Expand All @@ -181,3 +198,78 @@ impl Executor {
});
}
}

/// Reads the file at `path` and forwards its contents over `sx`.
///
/// With `decompress` unset the file is always read directly. With `decompress`
/// set the file extension selects the reader:
///
/// * `tar`, `gz` - handed to the tar reader,
/// * `zip`, `jar`, `bz2` - handed to the zip reader,
/// * `zst`, `rar`, `iso`, `rz`, `7z`, `s7z`, `aar`, `apk` - skipped, returning `Ok(())`,
/// * anything else - read directly.
///
/// A direct read goes through [`read_and_send`], which honours `scan_binary`.
///
/// The extension comes from `Path::extension` instead of slicing the last three
/// bytes of the name. Slicing panicked on names shorter than three bytes (including
/// the empty name produced for a non UTF-8 path) and on names ending in a multi-byte
/// character, and it mistook extension-less names that merely end in e.g. `tar`
/// for archives.
///
/// NOTE(review): `gz` and `bz2` files reach the tar/zip readers without any prior
/// decoding step in this function - confirm that this is intended.
///
/// # Errors
///
/// Returns whatever error the selected reader returns.
#[inline(always)]
fn extract_utf8_and_send(sx: &Sender<Option<DataWithInfo>>, path: &PathBuf, decompress: bool, scan_binary: bool) -> Result<(), ExecutorError> {
    let file_name = path.as_path().to_str().unwrap_or_default().to_string();
    if !decompress {
        return read_and_send(sx, path, file_name, scan_binary);
    }

    // A missing or non UTF-8 extension becomes "" and falls through to the plain read.
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or_default();

    match extension {
        "tar" | "gz" => decomopress_tar_archive_and_send(sx, path, file_name),
        "zip" | "jar" | "bz2" => decomopress_zip_archive_and_send(sx, path, file_name),
        "zst" | "rar" | "iso" | "rz" | "7z" | "s7z" | "aar" | "apk" => Ok(()),
        _ => read_and_send(sx, path, file_name, scan_binary),
    }
}

/// Reads one file and sends it over `sx`, picking the reader from `scan_binary`.
///
/// `scan_binary == true` uses the binary reader, otherwise the text reader is used.
/// The result of the chosen reader is returned unchanged.
#[inline(always)]
fn read_and_send(sx: &Sender<Option<DataWithInfo>>, path: &PathBuf, file_name: String, scan_binary: bool) -> Result<(), ExecutorError> {
    if scan_binary {
        read_binaryfile_and_send_to_channel(sx, path, file_name)
    } else {
        read_textfile_and_send_to_channel(sx, path, file_name)
    }
}

/// Reads the file at `path` as UTF-8 text and sends it over `sx` labelled with `file_name`.
///
/// # Errors
///
/// Returns the I/O error from `read_to_string`, which also covers content that is
/// not valid UTF-8. A failed channel send is ignored.
#[inline(always)]
fn read_textfile_and_send_to_channel(sx: &Sender<Option<DataWithInfo>>, path: &PathBuf, file_name: String) -> Result<(), ExecutorError> {
    let contents = read_to_string(path)?;
    let message = DataWithInfo { data: contents, file_name };
    // Best effort: the send result is deliberately discarded.
    let _ = sx.send(Some(message));

    Ok(())
}

/// Reads the file at `path` as raw bytes, converts them to text lossily and sends
/// the result over `sx` labelled with `file_name`.
///
/// Invalid UTF-8 sequences are replaced by `String::from_utf8_lossy`, so the read
/// does not fail on binary content.
///
/// # Errors
///
/// Returns the I/O error from reading the file. A failed channel send is ignored.
#[inline(always)]
fn read_binaryfile_and_send_to_channel(sx: &Sender<Option<DataWithInfo>>, path: &PathBuf, file_name: String) -> Result<(), ExecutorError> {
    let raw = read(&path)?;
    let text = String::from_utf8_lossy(&raw).into_owned();
    let message = DataWithInfo { data: text, file_name };
    // Best effort: the send result is deliberately discarded.
    let _ = sx.send(Some(message));

    Ok(())
}

/// Opens the zip archive at `path` and sends every entry that can be read as
/// UTF-8 text over `sx`.
///
/// Each message is labelled `"<file_name>/<entry name>"`. Entries whose content
/// cannot be read into a `String` are skipped; failed channel sends are ignored.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, if it cannot be parsed as a zip
/// archive, or if an entry cannot be looked up by index (which stops the walk).
#[inline(always)]
fn decomopress_zip_archive_and_send(sx: &Sender<Option<DataWithInfo>>, path: &PathBuf, file_name: String) -> Result<(), ExecutorError> {
    let handle = File::open(path)?;
    let mut archive = ZipArchive::new(&handle)?;

    for index in 0..archive.len() {
        let mut entry = archive.by_index(index)?;
        let entry_name = format!("{}/{}", file_name, entry.name());

        let mut contents = String::new();
        if entry.read_to_string(&mut contents).is_err() {
            // Unreadable as text: move on to the next entry.
            continue;
        }

        let _ = sx.send(Some(DataWithInfo { data: contents, file_name: entry_name }));
    }

    Ok(())
}

/// Opens the tar archive at `path` and sends every entry that can be read as
/// UTF-8 text over `sx`.
///
/// Each message is labelled `"<file_name>/<entry path>"`; an entry path that is
/// unavailable or not valid UTF-8 contributes an empty string. Entries whose
/// content cannot be read into a `String` are skipped; failed channel sends are
/// ignored.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, if the entry iterator cannot be
/// created, or if advancing to an entry fails (which stops the walk).
#[inline(always)]
fn decomopress_tar_archive_and_send(sx: &Sender<Option<DataWithInfo>>, path: &PathBuf, file_name: String) -> Result<(), ExecutorError> {
    let handle = File::open(path)?;
    let mut archive = Archive::new(&handle);

    for item in archive.entries()? {
        let mut entry = item?;

        // Build the label in its own scope so the borrowed entry path is released
        // before the entry is read mutably below.
        let entry_name = {
            let inner_path = entry.path().unwrap_or_default();
            format!("{}/{}", file_name, inner_path.to_str().unwrap_or_default())
        };

        let mut contents = String::new();
        if entry.read_to_string(&mut contents).is_err() {
            // Unreadable as text: move on to the next entry.
            continue;
        }

        let _ = sx.send(Some(DataWithInfo { data: contents, file_name: entry_name }));
    }

    Ok(())
}
26 changes: 23 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ fn main() {
arg!(--"dedup" <u64> "Level of de duplications. 0 or not specified - no dedup, 1 - file level dedup").value_parser(value_parser!(u8)),
).arg(
arg!(--"nodeps" "If specified omits default dependencies such as npm, venv, gems, ect."),
).arg(
arg!(--"scan-archives" "If specified performs archive scanning."),
).arg(
arg!(--"scan-binary" "If specified performs binary files scanning."),
))
.subcommand(
command!("git")
Expand All @@ -49,6 +53,10 @@ fn main() {
arg!(--"scan-remote" "If specified scans all remote brancheses."),
).arg(
arg!(--"branches" <String> "If specified scans branches from the given list, otherwise HEAD is scanned or all branches with flag --scan-local or -scan-remote."),
).arg(
arg!(--"scan-archives" "If specified performs archive scanning."),
).arg(
arg!(--"scan-binary" "If specified performs binary files scanning."),
));
let matches = cmd.get_matches();
match matches.subcommand() {
Expand All @@ -64,6 +72,8 @@ fn main() {
None,
None,
None,
matches.get_one("scan-archives"),
matches.get_one("scan-binary"),
) {
Ok(s) => println!("πŸŽ‰ {s}"),
Err(e) => println!("🀷 Failure {}", e.to_string()),
Expand All @@ -81,6 +91,8 @@ fn main() {
matches.get_one("scan-local"),
matches.get_one("scan-remote"),
matches.get_one("branches"),
matches.get_one("scan-archives"),
matches.get_one("scan-binary"),
) {
Ok(s) => println!("[ πŸŽ‰ {} ]", s),
Err(e) => println!("[ 🀷 {} ]", e.to_string()),
Expand All @@ -102,6 +114,8 @@ fn scan(
local: Option<&bool>,
remote: Option<&bool>,
branches: Option<&String>,
decompress: Option<&bool>,
read_binary: Option<&bool>,
) -> Result<String, Error> {
let dedup = dedup.unwrap_or(&0);
let nodeps = match nodeps {
Expand All @@ -121,7 +135,10 @@ fn scan(
None => None,
};

let mut executor = match Executor::new(&Config{data_source, path, url, config, omit, nodeps, branch_level, branches, sx_input}){
let decompress = if let Some(d) = decompress { *d }else{ false };
let read_binary = if let Some(d) = read_binary { *d }else{ false };

let mut executor = match Executor::new(&Config{data_source, path, url, config, omit, nodeps, branch_level, branches, sx_input, decompress, scan_binary: read_binary}){
Ok(e) => Ok(e),
Err(e) => Err(Error::raw(ErrorKind::InvalidValue, e)),
}?;
Expand All @@ -136,11 +153,14 @@ fn scan(
drop(wg_print_clone);
});

executor.execute();
let result = executor.execute();

wg_print.wait();

return Ok("Success".to_owned());
match result {
Ok(()) => Ok("Success".to_owned()),
Err(e) => Err(Error::raw(ErrorKind::Format, e)),
}
}

#[inline(always)]
Expand Down
2 changes: 1 addition & 1 deletion src/reporter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use std::time::Instant;
const REPORT_HEADER: &str = "[ πŸ“‹ SCANNING REPORT πŸ“‹ ]";
const REPORT_FOOTER: &str = "[ πŸ“‹ --------------- πŸ“‹ ]";
const GUESS_ANALITICS_CAPACITY: usize = 4096;
const GUESS_CACHE_CAPACITY: usize = 1024 * 10000 * 8; // 10MB
const GUESS_CACHE_CAPACITY: usize = 1024 * 1000 * 8; // 1MB

/// ReportWrite compounds trait Write and Debug.
///
Expand Down
10 changes: 6 additions & 4 deletions src/source/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@ use std::io;
///
#[derive(Error, Debug)]
pub enum SourceError {
#[error("failed due to file io failuer: {0}")]
#[error("failed due to file io failuer, {0}")]
FileIoFailure(#[from] io::Error),
#[error("failed due to git io failuer: {0}")]
#[error("failed due to git io failuer, {0}")]
Git2IoFailure(#[from] git2::Error),
#[error("failed due to async decomression failure, {0}")]
AsyncDecompresionFailure(String),
#[error("failed due to parameter is lacking, {0}")]
ParameterFailure(String),
#[error("failed due to source not ready, {0}")]
SourceNotReady(String),
#[error("failed due to not being able to access the branch: {0}")]
GitSourceNotReady(String),
#[error("failed due to not being able to access the {0} branch")]
BranchNotAccessible(String),
}
Loading

0 comments on commit b8e5aa1

Please sign in to comment.