Initial commit, basic indexing and plain text parsing
.gitignore (new file, vendored, 1 line)
@@ -0,0 +1 @@
/target
Cargo.lock (new file, generated, 1315 lines)
(file diff suppressed because it is too large)
Cargo.toml (new file, 15 lines)
@@ -0,0 +1,15 @@
[package]
name = "sift"
version = "0.1.0"
edition = "2024"

[dependencies]
atty = "0.2.14"
chrono = "0.4.42"
clap = "4.5.48"
color-eyre = "0.6.5"
crossterm = "0.29.0"
eyre = "0.6.12"
ordered-float = "5.0.0"
ratatui = "0.29.0"
tracing = "0.1.41"
bacon.toml (new file, 128 lines)
@@ -0,0 +1,128 @@
# This is a configuration file for the bacon tool
#
# Complete help on configuration: https://dystroy.org/bacon/config/
#
# You may check the current default at
# https://github.com/Canop/bacon/blob/main/defaults/default-bacon.toml

default_job = "check"
env.CARGO_TERM_COLOR = "always"

[jobs.check]
command = ["cargo", "check"]
need_stdout = false

[jobs.check-all]
command = ["cargo", "check", "--all-targets"]
need_stdout = false

# Run clippy on the default target
[jobs.clippy]
command = [
    "cargo", "clippy",
    "--",
    "-W", "clippy::pedantic",
    "-W", "clippy::nursery",
    "-W", "clippy::unwrap_used",
    "-W", "clippy::expect_used"
]
need_stdout = false

# Run clippy on all targets
# To disable some lints, you may change the job this way:
# [jobs.clippy-all]
# command = [
#     "cargo", "clippy",
#     "--all-targets",
#     "--",
#     "-A", "clippy::bool_to_int_with_if",
#     "-A", "clippy::collapsible_if",
#     "-A", "clippy::derive_partial_eq_without_eq",
# ]
# need_stdout = false
[jobs.clippy-all]
command = [
    "cargo", "clippy", "--all-targets",
    "--",
    "-W", "clippy::pedantic",
    "-W", "clippy::nursery",
    "-W", "clippy::unwrap_used",
    "-W", "clippy::expect_used"
]
need_stdout = false

# This job lets you run
# - all tests: bacon test
# - a specific test: bacon test -- config::test_default_files
# - the tests of a package: bacon test -- -- -p config
[jobs.test]
command = ["cargo", "test"]
need_stdout = true

[jobs.nextest]
command = [
    "cargo", "nextest", "run",
    "--hide-progress-bar", "--failure-output", "final"
]
need_stdout = true
analyzer = "nextest"

[jobs.doc]
command = ["cargo", "doc", "--no-deps"]
need_stdout = false

# If the doc compiles, then it opens in your browser and bacon switches
# to the previous job
[jobs.doc-open]
command = ["cargo", "doc", "--no-deps", "--open"]
need_stdout = false
on_success = "back" # so that we don't open the browser at each change

# You can run your application and have the result displayed in bacon,
# if it makes sense for this crate.
[jobs.run]
command = [
    "cargo", "run",
    # put launch parameters for your program behind a `--` separator
]
need_stdout = true
allow_warnings = true
background = true

# Run your long-running application (eg server) and have the result displayed in bacon.
# For programs that never stop (eg a server), `background` is set to false
# to have the cargo run output immediately displayed instead of waiting for
# program's end.
# 'on_change_strategy' is set to `kill_then_restart` to have your program restart
# on every change (an alternative would be to use the 'F5' key manually in bacon).
# If you often use this job, it makes sense to override the 'r' key by adding
# a binding `r = job:run-long` at the end of this file.
# A custom kill command such as the one suggested below is frequently needed to kill
# long running programs (uncomment it if you need it)
[jobs.run-long]
command = [
    "cargo", "run",
    # put launch parameters for your program behind a `--` separator
]
need_stdout = true
allow_warnings = true
background = false
on_change_strategy = "kill_then_restart"
# kill = ["pkill", "-TERM", "-P"]

# This parameterized job runs the example of your choice, as soon
# as the code compiles.
# Call it as
#    bacon ex -- my-example
[jobs.ex]
command = ["cargo", "run", "--example"]
need_stdout = true
allow_warnings = true

# You may define here keybindings that would be specific to
# a project, for example a shortcut to launch a specific job.
# Shortcuts to internal functions (scrolling, toggling, etc.)
# should go in your personal global prefs.toml file instead.
[keybindings]
# alt-m = "job:my-job"
c = "job:clippy-all" # comment this to have 'c' run clippy on only the default target
src/lib.rs (new file, 2 lines)
@@ -0,0 +1,2 @@
pub mod log_parser;
pub mod log_database;
src/log_database.rs (new file, 170 lines)
@@ -0,0 +1,170 @@
use std::collections::{HashMap, HashSet, BTreeMap};

use ordered_float::OrderedFloat;

use crate::log_parser::{LogParser, ParseError};

pub type LogIdx = usize;

pub enum LogValue {
    String(String),
    Integer(i64),
    Float(f64),
    Boolean(bool),
    Array(Box<Vec<LogValue>>),
    Map(Box<HashMap<String, LogValue>>),
}

pub struct LogField {
    // array indexes and number fields are converted to strings
    pub property: Option<String>,
    pub value: LogValue,
}

pub struct Log {
    pub fields: Vec<LogField>,
}

pub struct LogDatabase<T: LogParser> {
    parser: T,

    logs_raw: Vec<String>,
    /// maps property names to the set of all structured logs that have the property
    /// property names are always converted to strings
    /// nested properties are additionally indexed with each parent property prefix, using slash notation
    /// Example: the JSON { a: { b: { c: 1 } } } is indexed with the properties "c", "b/c", and "a/b/c"
    property_index: HashMap<String, HashSet<LogIdx>>,
    reverse_index: ReverseIndex,
}

impl<T: LogParser> LogDatabase<T> {
    pub fn new(parser: T) -> LogDatabase<T> {
        LogDatabase {
            parser,

            logs_raw: Vec::new(),
            property_index: HashMap::new(),
            reverse_index: ReverseIndex::new(),
        }
    }

    pub fn ingest(&mut self, line: String) -> Result<LogIdx, ParseError> {
        let log = self.parser.parse_line(&line)?;
        let idx = self.logs_raw.len();
        for field in log.fields.iter() {
            self.index_field(idx, &field.property, &field.value);
        }

        self.logs_raw.push(line);

        Ok(idx)
    }

    fn index_field(&mut self, idx: LogIdx, property: &Option<String>, value: &LogValue) {
        if let Some(property) = property {
            // add to property index
            let entry = self.property_index.get_mut(property);
            if let Some(idxs) = entry {
                idxs.insert(idx);
            } else {
                let mut idxs = HashSet::new();
                idxs.insert(idx);
                self.property_index.insert(property.clone(), idxs);
            }
        }

        // add to reverse index
        let field_location = FieldLocation { log_idx: idx, property: property.clone() };
        match value {
            LogValue::String(value) => {
                let entry = self.reverse_index.strings.get_mut(value);

                if let Some(field_locations) = entry {
                    field_locations.insert(field_location);
                } else {
                    let mut field_locations = HashSet::new();
                    field_locations.insert(field_location);
                    self.reverse_index.strings.insert(value.clone(), field_locations);
                }
            },
            LogValue::Integer(value) => {
                let entry = self.reverse_index.integers.get_mut(value);
                if let Some(field_locations) = entry {
                    field_locations.insert(field_location);
                } else {
                    let mut field_locations = HashSet::new();
                    field_locations.insert(field_location);
                    self.reverse_index.integers.insert(*value, field_locations);
                }
            },
            LogValue::Float(value) => {
                let ordered = OrderedFloat(*value);
                let entry = self.reverse_index.floats.get_mut(&ordered);
                if let Some(field_locations) = entry {
                    field_locations.insert(field_location);
                } else {
                    let mut field_locations = HashSet::new();
                    field_locations.insert(field_location);
                    self.reverse_index.floats.insert(ordered, field_locations);
                }
            },
            LogValue::Boolean(value) => {
                let entry = self.reverse_index.booleans.get_mut(value);
                if let Some(field_locations) = entry {
                    field_locations.insert(field_location);
                } else {
                    let mut field_locations = HashSet::new();
                    field_locations.insert(field_location);
                    self.reverse_index.booleans.insert(*value, field_locations);
                }
            },
            LogValue::Array(value) => {
                for (index, value) in value.iter().enumerate() {
                    let property = Some(index.to_string());
                    self.index_field(idx, &property, value);
                }
            },
            LogValue::Map(value) => {
                for (key, value) in value.iter() {
                    let property = Some(key.clone());
                    self.index_field(idx, &property, value);
                }
            },
        }
    }

    // TODO: implement as generic for LogValue types??
    pub fn find_value_string(&self, value: &str) -> impl Iterator<Item = &FieldLocation> {
        self.reverse_index.strings.get(value).into_iter().flatten()
    }
}

struct ReverseIndex {
    strings: HashMap<String, HashSet<FieldLocation>>,
    integers: BTreeMap<i64, HashSet<FieldLocation>>,
    floats: BTreeMap<OrderedFloat<f64>, HashSet<FieldLocation>>,
    booleans: HashMap<bool, HashSet<FieldLocation>>,
}

#[derive(PartialEq, Eq, Hash, Debug)]
pub struct FieldLocation {
    log_idx: LogIdx,
    property: Option<String>,
}

impl ReverseIndex {
    fn new() -> ReverseIndex {
        ReverseIndex {
            strings: HashMap::new(),
            integers: BTreeMap::new(),
            floats: BTreeMap::new(),
            booleans: HashMap::new(),
        }
    }
}
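The nesting scheme is easiest to see with a structured value. A minimal sketch, assuming a hypothetical FixedParser (not part of this commit) that emits the doc comment's JSON example through the public API; note that index_field as written indexes only leaf keys, so the "a/b/c" slash prefixes described in the doc comment are not yet produced:

// Illustrative sketch only: FixedParser always returns the value
// { "a": { "b": { "c": 1 } } } so we can see how nesting is indexed.
use std::collections::HashMap;

use sift::log_database::{Log, LogDatabase, LogField, LogValue};
use sift::log_parser::{LogParser, ParseError};

struct FixedParser;

impl LogParser for FixedParser {
    fn parse_line(&self, _line: &str) -> Result<Log, ParseError> {
        let c = HashMap::from([(String::from("c"), LogValue::Integer(1))]);
        let b = HashMap::from([(String::from("b"), LogValue::Map(Box::new(c)))]);
        Ok(Log {
            fields: vec![LogField {
                property: Some(String::from("a")),
                value: LogValue::Map(Box::new(b)),
            }],
        })
    }
}

fn main() {
    let mut db = LogDatabase::new(FixedParser);
    // index_field recurses through the maps: "a", "b", and "c" each land in
    // property_index, and the integer 1 is reverse-indexed at property "c".
    db.ingest(String::from(r#"{"a":{"b":{"c":1}}}"#)).unwrap();
}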
src/log_parser.rs (new file, 56 lines)
@@ -0,0 +1,56 @@
use crate::log_database::{Log, LogField, LogValue};

pub trait LogParser {
    fn parse_line(&self, line: &str) -> Result<Log, ParseError>;
}

#[derive(Debug)]
pub struct ParseError {
    pub message: &'static str,
    pub col: u32,
}


// a Log consists of multiple keywords, which are indexed into the keyword database
// each keyword has a data type, as well as positional text information for syntax highlighting
// data can also be structured, so keywords need to know their position in the data structure for targeted queries

pub struct TextParser<'a> {
    pub separator: Option<&'a str>,
}

/// Parses unstructured text word by word
impl<'a> LogParser for TextParser<'a> {
    fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
        let mut split_iter;
        let mut split_whitespace_iter;

        // unify the two concrete iterator types behind a trait object
        let words: &mut dyn Iterator<Item = &str> = match self.separator {
            Some(separator) => {
                split_iter = line.split(separator);
                &mut split_iter
            },
            None => {
                split_whitespace_iter = line.split_whitespace();
                &mut split_whitespace_iter
            },
        };

        let fields: Vec<LogField> = words
            .map(|word| LogField {
                property: None,
                value: LogValue::String(String::from(word)),
            })
            .collect();

        Ok(Log { fields })
    }
}

// pub struct JsonParser {
// }

// impl LogParser for JsonParser {
//     fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
//         Ok(Log {})
//     }
// }
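A short usage sketch for TextParser with an explicit separator (illustrative, not part of the commit):

use sift::log_database::LogValue;
use sift::log_parser::{LogParser, TextParser};

fn main() {
    // split on commas instead of whitespace
    let parser = TextParser { separator: Some(",") };
    let log = parser.parse_line("error,disk full,sda1").unwrap();

    // every word becomes an anonymous string field
    assert_eq!(log.fields.len(), 3);
    assert!(log.fields.iter().all(|f| f.property.is_none()));
    assert!(matches!(&log.fields[1].value, LogValue::String(s) if s.as_str() == "disk full"));
}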
src/main.rs (new file, 28 lines)
@@ -0,0 +1,28 @@
// use std::io::{self, BufRead};
// use atty::Stream;

use sift::log_database::LogDatabase;
use sift::log_parser::TextParser;

fn main() {
    // if atty::is(Stream::Stdin) {
    //     println!("no pipe");
    //     return;
    // }

    // let stdin = io::stdin();
    // for line in stdin.lock().lines() {
    //     let line = line.expect("Failed to read line from stdin");
    //     println!("{}", line);
    // }

    let mut log_database = LogDatabase::new(TextParser { separator: None });
    log_database.ingest(String::from("hello world")).unwrap();
    log_database.ingest(String::from("have a good hello")).unwrap();
    log_database.ingest(String::from("goodbye world")).unwrap();

    println!("hello {:?}", log_database.find_value_string("hello").collect::<Vec<_>>());
    println!("have {:?}", log_database.find_value_string("have").collect::<Vec<_>>());
    println!("world {:?}", log_database.find_value_string("world").collect::<Vec<_>>());
    println!("elliot {:?}", log_database.find_value_string("elliot").collect::<Vec<_>>());
}
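For reference, the demo resolves each lookup against the reverse string index: "hello" appears in logs 0 and 1, "have" only in log 1, "world" in logs 0 and 2, and "elliot" nowhere. The output should have roughly this shape (HashSet iteration order varies between runs):

hello [FieldLocation { log_idx: 0, property: None }, FieldLocation { log_idx: 1, property: None }]
have [FieldLocation { log_idx: 1, property: None }]
world [FieldLocation { log_idx: 0, property: None }, FieldLocation { log_idx: 2, property: None }]
elliot []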