Initial commit, basic indexing and plain text parsing

2025-09-26 16:08:41 -06:00
commit 40d1e710ba
8 changed files with 1715 additions and 0 deletions

.gitignore vendored Normal file (1 line)

@@ -0,0 +1 @@
/target

Cargo.lock generated Normal file (1315 lines)
File diff suppressed because it is too large

Cargo.toml Normal file (15 lines)

@@ -0,0 +1,15 @@
[package]
name = "sift"
version = "0.1.0"
edition = "2024"

[dependencies]
atty = "0.2.14"
chrono = "0.4.42"
clap = "4.5.48"
color-eyre = "0.6.5"
crossterm = "0.29.0"
eyre = "0.6.12"
ordered-float = "5.0.0"
ratatui = "0.29.0"
tracing = "0.1.41"

bacon.toml Normal file (128 lines)

@@ -0,0 +1,128 @@
# This is a configuration file for the bacon tool
#
# Complete help on configuration: https://dystroy.org/bacon/config/
#
# You may check the current default at
# https://github.com/Canop/bacon/blob/main/defaults/default-bacon.toml

default_job = "check"
env.CARGO_TERM_COLOR = "always"

[jobs.check]
command = ["cargo", "check"]
need_stdout = false

[jobs.check-all]
command = ["cargo", "check", "--all-targets"]
need_stdout = false

# Run clippy on the default target
[jobs.clippy]
command = [
    "cargo", "clippy",
    "--",
    "-W", "clippy::pedantic",
    "-W", "clippy::nursery",
    "-W", "clippy::unwrap_used",
    "-W", "clippy::expect_used",
]
need_stdout = false

# Run clippy on all targets
# To disable some lints, you may change the job this way:
#    [jobs.clippy-all]
#    command = [
#        "cargo", "clippy",
#        "--all-targets",
#        "--",
#        "-A", "clippy::bool_to_int_with_if",
#        "-A", "clippy::collapsible_if",
#        "-A", "clippy::derive_partial_eq_without_eq",
#    ]
#    need_stdout = false
[jobs.clippy-all]
command = [
    "cargo", "clippy", "--all-targets",
    "--",
    "-W", "clippy::pedantic",
    "-W", "clippy::nursery",
    "-W", "clippy::unwrap_used",
    "-W", "clippy::expect_used",
]
need_stdout = false

# This job lets you run
# - all tests: bacon test
# - a specific test: bacon test -- config::test_default_files
# - the tests of a package: bacon test -- -- -p config
[jobs.test]
command = ["cargo", "test"]
need_stdout = true

[jobs.nextest]
command = [
    "cargo", "nextest", "run",
    "--hide-progress-bar", "--failure-output", "final",
]
need_stdout = true
analyzer = "nextest"

[jobs.doc]
command = ["cargo", "doc", "--no-deps"]
need_stdout = false

# If the doc compiles, then it opens in your browser and bacon switches
# to the previous job
[jobs.doc-open]
command = ["cargo", "doc", "--no-deps", "--open"]
need_stdout = false
on_success = "back" # so that we don't open the browser at each change

# You can run your application and have the result displayed in bacon,
# if it makes sense for this crate.
[jobs.run]
command = [
    "cargo", "run",
    # put launch parameters for your program behind a `--` separator
]
need_stdout = true
allow_warnings = true
background = true

# Run your long-running application (eg a server) and have the result displayed in bacon.
# For programs that never stop (eg a server), `background` is set to false
# to have the cargo run output immediately displayed instead of waiting for
# the program's end.
# `on_change_strategy` is set to `kill_then_restart` to have your program restart
# on every change (an alternative would be to use the 'F5' key manually in bacon).
# If you often use this job, it makes sense to override the 'r' key by adding
# a binding `r = "job:run-long"` at the end of this file.
# A custom kill command such as the one suggested below is frequently needed to kill
# long-running programs (uncomment it if you need it)
[jobs.run-long]
command = [
    "cargo", "run",
    # put launch parameters for your program behind a `--` separator
]
need_stdout = true
allow_warnings = true
background = false
on_change_strategy = "kill_then_restart"
# kill = ["pkill", "-TERM", "-P"]

# This parameterized job runs the example of your choice, as soon
# as the code compiles.
# Call it as
#    bacon ex -- my-example
[jobs.ex]
command = ["cargo", "run", "--example"]
need_stdout = true
allow_warnings = true

# You may define here keybindings that would be specific to
# a project, for example a shortcut to launch a specific job.
# Shortcuts to internal functions (scrolling, toggling, etc.)
# should go in your personal global prefs.toml file instead.
[keybindings]
# alt-m = "job:my-job"
c = "job:clippy-all" # comment this to have 'c' run clippy on only the default target

src/lib.rs Normal file (2 lines)

@@ -0,0 +1,2 @@
pub mod log_parser;
pub mod log_database;

src/log_database.rs Normal file (170 lines)

@@ -0,0 +1,170 @@
use std::collections::{BTreeMap, HashMap, HashSet};

use ordered_float::OrderedFloat;

use crate::log_parser::{LogParser, ParseError};

pub type LogIdx = usize;

pub enum LogValue {
    String(String),
    Integer(i64),
    Float(f64),
    Boolean(bool),
    Array(Box<Vec<LogValue>>),
    Map(Box<HashMap<String, LogValue>>),
}

pub struct LogField {
    // array indexes and numeric keys are converted to strings
    pub property: Option<String>,
    pub value: LogValue,
}

pub struct Log {
    pub fields: Vec<LogField>,
}

pub struct LogDatabase<T: LogParser> {
    parser: T,
    logs_raw: Vec<String>,
    /// Maps property names to the set of all structured logs that have the property.
    /// Property names are always converted to strings.
    /// Nested properties are additionally indexed under each parent-property prefix,
    /// using slash notation.
    /// Example: the JSON { "a": { "b": { "c": 1 } } } is indexed under the
    /// properties "c", "b/c", and "a/b/c".
    property_index: HashMap<String, HashSet<LogIdx>>,
    reverse_index: ReverseIndex,
}

impl<T: LogParser> LogDatabase<T> {
    pub fn new(parser: T) -> LogDatabase<T> {
        LogDatabase {
            parser,
            logs_raw: Vec::new(),
            property_index: HashMap::new(),
            reverse_index: ReverseIndex::new(),
        }
    }

    pub fn ingest(&mut self, line: String) -> Result<LogIdx, ParseError> {
        let log = self.parser.parse_line(&line)?;
        let idx = self.logs_raw.len();
        for field in log.fields.iter() {
            self.index_field(idx, &field.property, &field.value);
        }
        self.logs_raw.push(line);
        Ok(idx)
    }

    fn index_field(&mut self, idx: LogIdx, property: &Option<String>, value: &LogValue) {
        if let Some(property) = property {
            // add to property index
            self.property_index
                .entry(property.clone())
                .or_default()
                .insert(idx);
        }
        // add to reverse index
        let field_location = FieldLocation { log_idx: idx, property: property.clone() };
        match value {
            LogValue::String(value) => {
                self.reverse_index
                    .strings
                    .entry(value.clone())
                    .or_default()
                    .insert(field_location);
            },
            LogValue::Integer(value) => {
                self.reverse_index
                    .integers
                    .entry(*value)
                    .or_default()
                    .insert(field_location);
            },
            LogValue::Float(value) => {
                self.reverse_index
                    .floats
                    .entry(OrderedFloat(*value))
                    .or_default()
                    .insert(field_location);
            },
            LogValue::Boolean(value) => {
                self.reverse_index
                    .booleans
                    .entry(*value)
                    .or_default()
                    .insert(field_location);
            },
            LogValue::Array(values) => {
                for (index, value) in values.iter().enumerate() {
                    let property = Some(index.to_string());
                    self.index_field(idx, &property, value);
                }
            },
            LogValue::Map(entries) => {
                for (key, value) in entries.iter() {
                    let property = Some(key.clone());
                    self.index_field(idx, &property, value);
                }
            },
        }
    }

    // TODO: implement as a generic over LogValue types??
    pub fn find_value_string(&self, value: &str) -> impl Iterator<Item = &FieldLocation> {
        self.reverse_index.strings.get(value).into_iter().flatten()
    }
}

struct ReverseIndex {
    strings: HashMap<String, HashSet<FieldLocation>>,
    integers: BTreeMap<i64, HashSet<FieldLocation>>,
    floats: BTreeMap<OrderedFloat<f64>, HashSet<FieldLocation>>,
    booleans: HashMap<bool, HashSet<FieldLocation>>,
}

#[derive(PartialEq, Eq, Hash, Debug)]
pub struct FieldLocation {
    log_idx: LogIdx,
    property: Option<String>,
}

impl ReverseIndex {
    fn new() -> ReverseIndex {
        ReverseIndex {
            strings: HashMap::new(),
            integers: BTreeMap::new(),
            floats: BTreeMap::new(),
            booleans: HashMap::new(),
        }
    }
}
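
A note on the slash-notation scheme described in the property_index doc comment: as committed, index_field recurses with only the immediate key, so a nested value is indexed under leaf names like "c" but not yet under the parent prefixes "b/c" and "a/b/c". A minimal sketch of how those suffix paths could be derived; the suffix_paths helper is hypothetical and not part of this commit:

// Hypothetical helper, not in this commit: given the full key path to a
// leaf value, produce every suffix path in slash notation,
// e.g. ["a", "b", "c"] -> ["c", "b/c", "a/b/c"].
fn suffix_paths(path: &[&str]) -> Vec<String> {
    (0..path.len())
        .map(|start| path[start..].join("/"))
        .collect()
}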

src/log_parser.rs Normal file (56 lines)

@@ -0,0 +1,56 @@
use crate::log_database::{Log, LogField, LogValue};

pub trait LogParser {
    fn parse_line(&self, line: &str) -> Result<Log, ParseError>;
}

#[derive(Debug)]
pub struct ParseError {
    pub message: &'static str,
    pub col: u32,
}

// a Log consists of multiple keywords, which are indexed into the keyword database
// each keyword has a data type, as well as positional text information for syntax highlighting
// data can also be structured, so keywords need to know their position in the data structure for targeted queries

/// Parses unstructured text word by word
pub struct TextParser<'a> {
    pub separator: Option<&'a str>,
}

impl<'a> LogParser for TextParser<'a> {
    fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
        // The two split iterators have different concrete types, so both
        // locals are declared up front and a `&mut dyn Iterator` unifies them.
        let mut split_iter;
        let mut split_whitespace_iter;
        let words: &mut dyn Iterator<Item = &str> = match self.separator {
            Some(separator) => {
                split_iter = line.split(separator);
                &mut split_iter
            },
            None => {
                split_whitespace_iter = line.split_whitespace();
                &mut split_whitespace_iter
            },
        };
        let fields: Vec<LogField> = words
            .map(|word| LogField {
                property: None,
                value: LogValue::String(String::from(word)),
            })
            .collect();
        Ok(Log { fields })
    }
}
// pub struct JsonParser {
// }
// impl LogParser for JsonParser {
// fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
// Ok(Log {})
// }
// }
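
The JsonParser stub above is left commented out in this commit. As an illustration only, here is one way it might be filled in, assuming a serde_json dependency were added (it is not in this commit's Cargo.toml); the JsonParser struct, the to_log_value helper, and the null-to-empty-string mapping are all hypothetical choices:

// Hypothetical sketch, not part of this commit: a JsonParser built on
// serde_json (which would need to be added to Cargo.toml).
use serde_json::Value;

pub struct JsonParser;

impl LogParser for JsonParser {
    fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
        let value: Value = serde_json::from_str(line).map_err(|e| ParseError {
            message: "invalid JSON",
            col: e.column() as u32,
        })?;
        match value {
            Value::Object(map) => Ok(Log {
                fields: map
                    .into_iter()
                    .map(|(key, value)| LogField {
                        property: Some(key),
                        value: to_log_value(value),
                    })
                    .collect(),
            }),
            _ => Err(ParseError { message: "expected a JSON object", col: 0 }),
        }
    }
}

// Converts a serde_json::Value into this crate's LogValue.
// Mapping null to an empty string is an arbitrary placeholder choice.
fn to_log_value(value: Value) -> LogValue {
    match value {
        Value::Null => LogValue::String(String::new()),
        Value::Bool(b) => LogValue::Boolean(b),
        Value::Number(n) => match n.as_i64() {
            Some(i) => LogValue::Integer(i),
            None => LogValue::Float(n.as_f64().unwrap_or(f64::NAN)),
        },
        Value::String(s) => LogValue::String(s),
        Value::Array(items) => {
            LogValue::Array(Box::new(items.into_iter().map(to_log_value).collect()))
        },
        Value::Object(map) => LogValue::Map(Box::new(
            map.into_iter().map(|(k, v)| (k, to_log_value(v))).collect(),
        )),
    }
}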

src/main.rs Normal file (28 lines)

@@ -0,0 +1,28 @@
// use std::io::{self, BufRead};
// use atty::Stream;
use sift::log_database::LogDatabase;
use sift::log_parser::TextParser;

fn main() {
    // if atty::is(Stream::Stdin) {
    //     println!("no pipe");
    //     return;
    // }
    // let stdin = io::stdin();
    // for line in stdin.lock().lines() {
    //     let line = line.expect("Failed to read line from stdin");
    //     println!("{}", line);
    // }
    let mut log_database = LogDatabase::new(TextParser { separator: None });
    log_database.ingest(String::from("hello world")).unwrap();
    log_database.ingest(String::from("have a good hello")).unwrap();
    log_database.ingest(String::from("goodbye world")).unwrap();
    println!("hello {:?}", log_database.find_value_string("hello").collect::<Vec<_>>());
    println!("have {:?}", log_database.find_value_string("have").collect::<Vec<_>>());
    println!("world {:?}", log_database.find_value_string("world").collect::<Vec<_>>());
    println!("elliot {:?}", log_database.find_value_string("elliot").collect::<Vec<_>>());
}
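
For reference: with this commit's TextParser, every word is stored with property: None, so the lookups above should print FieldLocation entries for logs 0 and 1 for "hello", log 1 for "have", logs 0 and 2 for "world", and an empty list for "elliot". The order within each printed list is unspecified, since the locations are kept in a HashSet.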