Initial commit, basic indexing and plain text parsing
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
/target
|
||||||
1315
Cargo.lock
generated
Normal file
1315
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
15
Cargo.toml
Normal file
15
Cargo.toml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
[package]
|
||||||
|
name = "sift"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
atty = "0.2.14"
|
||||||
|
chrono = "0.4.42"
|
||||||
|
clap = "4.5.48"
|
||||||
|
color-eyre = "0.6.5"
|
||||||
|
crossterm = "0.29.0"
|
||||||
|
eyre = "0.6.12"
|
||||||
|
ordered-float = "5.0.0"
|
||||||
|
ratatui = "0.29.0"
|
||||||
|
tracing = "0.1.41"
|
||||||
128
bacon.toml
Normal file
128
bacon.toml
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
# This is a configuration file for the bacon tool
|
||||||
|
#
|
||||||
|
# Complete help on configuration: https://dystroy.org/bacon/config/
|
||||||
|
#
|
||||||
|
# You may check the current default at
|
||||||
|
# https://github.com/Canop/bacon/blob/main/defaults/default-bacon.toml
|
||||||
|
|
||||||
|
default_job = "check"
|
||||||
|
env.CARGO_TERM_COLOR = "always"
|
||||||
|
|
||||||
|
[jobs.check]
|
||||||
|
command = ["cargo", "check"]
|
||||||
|
need_stdout = false
|
||||||
|
|
||||||
|
[jobs.check-all]
|
||||||
|
command = ["cargo", "check", "--all-targets"]
|
||||||
|
need_stdout = false
|
||||||
|
|
||||||
|
# Run clippy on the default target
|
||||||
|
[jobs.clippy]
|
||||||
|
command = [
|
||||||
|
"cargo", "clippy",
|
||||||
|
"--",
|
||||||
|
"-W", "clippy::pedantic",
|
||||||
|
"-W", "clippy::nursery",
|
||||||
|
"-W", "clippy::unwrap_used",
|
||||||
|
"-W", "clippy::expect_used"
|
||||||
|
]
|
||||||
|
need_stdout = false
|
||||||
|
|
||||||
|
# Run clippy on all targets
|
||||||
|
# To disable some lints, you may change the job this way:
|
||||||
|
# [jobs.clippy-all]
|
||||||
|
# command = [
|
||||||
|
# "cargo", "clippy",
|
||||||
|
# "--all-targets",
|
||||||
|
# "--",
|
||||||
|
# "-A", "clippy::bool_to_int_with_if",
|
||||||
|
# "-A", "clippy::collapsible_if",
|
||||||
|
# "-A", "clippy::derive_partial_eq_without_eq",
|
||||||
|
# ]
|
||||||
|
# need_stdout = false
|
||||||
|
[jobs.clippy-all]
|
||||||
|
command = [
|
||||||
|
"cargo", "clippy", "--all-targets",
|
||||||
|
"--",
|
||||||
|
"-W", "clippy::pedantic",
|
||||||
|
"-W", "clippy::nursery",
|
||||||
|
"-W", "clippy::unwrap_used",
|
||||||
|
"-W", "clippy::expect_used"
|
||||||
|
]
|
||||||
|
need_stdout = false
|
||||||
|
|
||||||
|
# This job lets you run
|
||||||
|
# - all tests: bacon test
|
||||||
|
# - a specific test: bacon test -- config::test_default_files
|
||||||
|
# - the tests of a package: bacon test -- -- -p config
|
||||||
|
[jobs.test]
|
||||||
|
command = ["cargo", "test"]
|
||||||
|
need_stdout = true
|
||||||
|
|
||||||
|
[jobs.nextest]
|
||||||
|
command = [
|
||||||
|
"cargo", "nextest", "run",
|
||||||
|
"--hide-progress-bar", "--failure-output", "final"
|
||||||
|
]
|
||||||
|
need_stdout = true
|
||||||
|
analyzer = "nextest"
|
||||||
|
|
||||||
|
[jobs.doc]
|
||||||
|
command = ["cargo", "doc", "--no-deps"]
|
||||||
|
need_stdout = false
|
||||||
|
|
||||||
|
# If the doc compiles, then it opens in your browser and bacon switches
|
||||||
|
# to the previous job
|
||||||
|
[jobs.doc-open]
|
||||||
|
command = ["cargo", "doc", "--no-deps", "--open"]
|
||||||
|
need_stdout = false
|
||||||
|
on_success = "back" # so that we don't open the browser at each change
|
||||||
|
|
||||||
|
# You can run your application and have the result displayed in bacon,
|
||||||
|
# if it makes sense for this crate.
|
||||||
|
[jobs.run]
|
||||||
|
command = [
|
||||||
|
"cargo", "run",
|
||||||
|
# put launch parameters for your program behind a `--` separator
|
||||||
|
]
|
||||||
|
need_stdout = true
|
||||||
|
allow_warnings = true
|
||||||
|
background = true
|
||||||
|
|
||||||
|
# Run your long-running application (eg server) and have the result displayed in bacon.
|
||||||
|
# For programs that never stop (eg a server), `background` is set to false
|
||||||
|
# to have the cargo run output immediately displayed instead of waiting for
|
||||||
|
# program's end.
|
||||||
|
# 'on_change_strategy' is set to `kill_then_restart` to have your program restart
|
||||||
|
# on every change (an alternative would be to use the 'F5' key manually in bacon).
|
||||||
|
# If you often use this job, it makes sense to override the 'r' key by adding
|
||||||
|
# a binding `r = job:run-long` at the end of this file .
|
||||||
|
# A custom kill command such as the one suggested below is frequently needed to kill
|
||||||
|
# long running programs (uncomment it if you need it)
|
||||||
|
[jobs.run-long]
|
||||||
|
command = [
|
||||||
|
"cargo", "run",
|
||||||
|
# put launch parameters for your program behind a `--` separator
|
||||||
|
]
|
||||||
|
need_stdout = true
|
||||||
|
allow_warnings = true
|
||||||
|
background = false
|
||||||
|
on_change_strategy = "kill_then_restart"
|
||||||
|
# kill = ["pkill", "-TERM", "-P"]
|
||||||
|
|
||||||
|
# This parameterized job runs the example of your choice, as soon
|
||||||
|
# as the code compiles.
|
||||||
|
# Call it as
|
||||||
|
# bacon ex -- my-example
|
||||||
|
[jobs.ex]
|
||||||
|
command = ["cargo", "run", "--example"]
|
||||||
|
need_stdout = true
|
||||||
|
allow_warnings = true
|
||||||
|
|
||||||
|
# You may define here keybindings that would be specific to
|
||||||
|
# a project, for example a shortcut to launch a specific job.
|
||||||
|
# Shortcuts to internal functions (scrolling, toggling, etc.)
|
||||||
|
# should go in your personal global prefs.toml file instead.
|
||||||
|
[keybindings]
|
||||||
|
# alt-m = "job:my-job"
|
||||||
|
c = "job:clippy-all" # comment this to have 'c' run clippy on only the default target
|
||||||
2
src/lib.rs
Normal file
2
src/lib.rs
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
/// Parsing of raw log lines into structured logs.
pub mod log_parser;
/// Storage plus property/value indexing of parsed logs.
pub mod log_database;
|
||||||
170
src/log_database.rs
Normal file
170
src/log_database.rs
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
use std::collections::{HashMap, HashSet, BTreeMap};
|
||||||
|
|
||||||
|
use ordered_float::OrderedFloat;
|
||||||
|
|
||||||
|
use crate::log_parser::{LogParser, ParseError};
|
||||||
|
|
||||||
|
/// Index of a log record; assigned in ingestion order (position in `logs_raw`).
pub type LogIdx = usize;
|
||||||
|
|
||||||
|
/// A single parsed value from a log line.
///
/// `Array` and `Map` nest further `LogValue`s; `LogDatabase::index_field`
/// flattens them recursively when indexing.
// NOTE(review): `Box<Vec<_>>`/`Box<HashMap<_>>` double-indirect — Vec and
// HashMap are already heap-backed; the Box could likely be dropped, but that
// would change the public variant types, so it is left as-is here.
#[derive(Debug, Clone, PartialEq)]
pub enum LogValue {
    String(String),
    Integer(i64),
    Float(f64),
    Boolean(bool),
    Array(Box<Vec<LogValue>>),
    Map(Box<HashMap<String, LogValue>>),
}
|
||||||
|
|
||||||
|
/// One named (or unnamed) value within a `Log`.
pub struct LogField {
    // array indexes and number fields are converted to strings
    /// Property name under which this field is indexed; `None` for
    /// unnamed fields (e.g. words from plain text).
    pub property: Option<String>,
    /// The parsed value of this field.
    pub value: LogValue,
}
|
||||||
|
|
||||||
|
/// A single parsed log record: the fields a parser extracted from one line.
pub struct Log {
    /// Fields in the order the parser produced them.
    pub fields: Vec<LogField>,
}
|
||||||
|
|
||||||
|
pub struct LogDatabase<T: LogParser> {
|
||||||
|
parser: T,
|
||||||
|
|
||||||
|
logs_raw: Vec<String>,
|
||||||
|
/// maps property names to the set of all structured logs that have the property
|
||||||
|
/// property names are always converted to strings
|
||||||
|
/// nested properties are additionally indexed with each parent property prefixes, using slash notation
|
||||||
|
/// Example: JSON { a: { b: c: 1 } } is indexed with the properties "c", "b/c", and "a/b/c"
|
||||||
|
property_index: HashMap<String, HashSet<LogIdx>>,
|
||||||
|
reverse_index: ReverseIndex,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: LogParser> LogDatabase<T> {
|
||||||
|
pub fn new(parser: T) -> LogDatabase<T> {
|
||||||
|
LogDatabase {
|
||||||
|
parser,
|
||||||
|
|
||||||
|
logs_raw: Vec::new(),
|
||||||
|
property_index: HashMap::new(),
|
||||||
|
reverse_index: ReverseIndex::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn ingest(&mut self, line: String) -> Result<LogIdx, ParseError> {
|
||||||
|
let log = self.parser.parse_line(&line)?;
|
||||||
|
let idx = self.logs_raw.len();
|
||||||
|
for field in log.fields.iter() {
|
||||||
|
self.index_field(idx, &field.property, &field.value);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.logs_raw.push(line);
|
||||||
|
|
||||||
|
Ok(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn index_field(&mut self, idx: LogIdx, property: &Option<String>, value: &LogValue) {
|
||||||
|
if let Some(property) = property {
|
||||||
|
// add to property index
|
||||||
|
let entry = self.property_index.get_mut(property);
|
||||||
|
if let Some(idxs) = entry {
|
||||||
|
idxs.insert(idx);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
let mut idxs = HashSet::new();
|
||||||
|
idxs.insert(idx);
|
||||||
|
self.property_index.insert(property.clone(), idxs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// add to reverse index
|
||||||
|
let field_location = FieldLocation { log_idx: idx, property: property.clone() };
|
||||||
|
match value {
|
||||||
|
LogValue::String(value) => {
|
||||||
|
let entry = self.reverse_index.strings.get_mut(value);
|
||||||
|
|
||||||
|
if let Some(field_locations) = entry {
|
||||||
|
field_locations.insert(field_location);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
let mut field_locations = HashSet::new();
|
||||||
|
field_locations.insert(field_location);
|
||||||
|
self.reverse_index.strings.insert(value.clone(), field_locations);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
LogValue::Integer(value) => {
|
||||||
|
let entry = self.reverse_index.integers.get_mut(value);
|
||||||
|
if let Some(field_locations) = entry {
|
||||||
|
field_locations.insert(field_location);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
let mut field_locations = HashSet::new();
|
||||||
|
field_locations.insert(field_location);
|
||||||
|
self.reverse_index.integers.insert(*value, field_locations);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
LogValue::Float(value) => {
|
||||||
|
let ordered = OrderedFloat(*value);
|
||||||
|
let entry = self.reverse_index.floats.get_mut(&ordered);
|
||||||
|
if let Some(field_locations) = entry {
|
||||||
|
field_locations.insert(field_location);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
let mut field_locations = HashSet::new();
|
||||||
|
field_locations.insert(field_location);
|
||||||
|
self.reverse_index.floats.insert(ordered, field_locations);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
LogValue::Boolean(value) => {
|
||||||
|
let entry = self.reverse_index.booleans.get_mut(value);
|
||||||
|
if let Some(field_locations) = entry {
|
||||||
|
field_locations.insert(field_location);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
let mut field_locations = HashSet::new();
|
||||||
|
field_locations.insert(field_location);
|
||||||
|
self.reverse_index.booleans.insert(*value, field_locations);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
LogValue::Array(value) => {
|
||||||
|
for (index, value) in value.iter().enumerate() {
|
||||||
|
let property = Some(index.to_string());
|
||||||
|
self.index_field(idx, &property, value);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
LogValue::Map(value) => {
|
||||||
|
for (key, value) in value.iter() {
|
||||||
|
let property = Some(key.clone());
|
||||||
|
self.index_field(idx, &property, value);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: implemnt as generic for LogValue types??
|
||||||
|
pub fn find_value_string(&self, value: &str) -> impl Iterator<Item = &FieldLocation> {
|
||||||
|
self.reverse_index.strings.get(value).into_iter().flatten()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Maps exact field values back to every location where they occur.
struct ReverseIndex {
    /// string value → all (log, property) locations holding exactly that string
    strings: HashMap<String, HashSet<FieldLocation>>,
    // NOTE(review): BTreeMap (vs HashMap) keeps numeric keys ordered —
    // presumably to allow range queries later; confirm intent.
    integers: BTreeMap<i64, HashSet<FieldLocation>>,
    /// `OrderedFloat` gives f64 the total order required for a map key.
    floats: BTreeMap<OrderedFloat<f64>, HashSet<FieldLocation>>,
    booleans: HashMap<bool, HashSet<FieldLocation>>,
}
|
||||||
|
|
||||||
|
#[derive(PartialEq, Eq, Hash, Debug)]
|
||||||
|
pub struct FieldLocation {
|
||||||
|
log_idx: LogIdx,
|
||||||
|
property: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
impl ReverseIndex {
|
||||||
|
fn new() -> ReverseIndex {
|
||||||
|
ReverseIndex {
|
||||||
|
strings: HashMap::new(),
|
||||||
|
integers: BTreeMap::new(),
|
||||||
|
floats: BTreeMap::new(),
|
||||||
|
booleans: HashMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
56
src/log_parser.rs
Normal file
56
src/log_parser.rs
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
use crate::log_database::{Log, LogField, LogValue};
|
||||||
|
|
||||||
|
/// A strategy for turning one raw log line into a structured `Log`.
pub trait LogParser {
    /// Parses a single raw line into a `Log`.
    ///
    /// # Errors
    /// Returns a `ParseError` when the line cannot be parsed.
    fn parse_line(&self, line: &str) -> Result<Log, ParseError>;
}
|
||||||
|
|
||||||
|
/// Error produced when a [`LogParser`] fails to parse a line.
#[derive(Debug)]
pub struct ParseError {
    /// Static description of what went wrong.
    pub message: &'static str,
    /// Column where parsing failed (byte vs. char offset — TODO confirm
    /// the intended convention).
    pub col: u32,
}

impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "parse error at column {}: {}", self.col, self.message)
    }
}

// Implementing std::error::Error lets ParseError interoperate with
// `Box<dyn Error>` and the eyre/color-eyre crates already in Cargo.toml.
impl std::error::Error for ParseError {}
|
||||||
|
|
||||||
|
|
||||||
|
// A Log consists of multiple keywords, which are indexed into the keyword database.
// Each keyword has a data type, as well as positional text information for syntax highlighting.
// Data can also be structured, so keywords need to know their position in the data structure for targeted queries.
|
||||||
|
|
||||||
|
/// Word-by-word parser for unstructured text lines.
pub struct TextParser<'a> {
    /// Separator to split on; `None` splits on runs of whitespace.
    pub separator: Option<&'a str>,
}
|
||||||
|
|
||||||
|
/// Parses unstructured text word by word
|
||||||
|
impl<'a> LogParser for TextParser<'a> {
|
||||||
|
fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
|
||||||
|
let mut split_iter;
|
||||||
|
let mut split_whitespace_iter;
|
||||||
|
|
||||||
|
let words: &mut dyn Iterator<Item = &str> = match self.separator {
|
||||||
|
Some(separator) => {
|
||||||
|
split_iter = line.split(separator);
|
||||||
|
&mut split_iter
|
||||||
|
},
|
||||||
|
None => {
|
||||||
|
split_whitespace_iter = line.split_whitespace();
|
||||||
|
&mut split_whitespace_iter
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
let fields: Vec<LogField> = words
|
||||||
|
.map(|word| LogField {
|
||||||
|
property: None,
|
||||||
|
value: LogValue::String(String::from(word))
|
||||||
|
}).collect();
|
||||||
|
|
||||||
|
Ok(Log {fields})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// pub struct JsonParser {
|
||||||
|
// }
|
||||||
|
|
||||||
|
// impl LogParser for JsonParser {
|
||||||
|
// fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
|
||||||
|
// Ok(Log {})
|
||||||
|
// }
|
||||||
|
// }
|
||||||
28
src/main.rs
Normal file
28
src/main.rs
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
// use std::io::{self, BufRead};
|
||||||
|
// use atty::Stream;
|
||||||
|
|
||||||
|
use sift::log_database::LogDatabase;
|
||||||
|
use sift::log_parser::TextParser;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
// if atty::is(Stream::Stdin) {
|
||||||
|
// println!("no pipe");
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
|
||||||
|
// let stdin = io::stdin();
|
||||||
|
// for line in stdin.lock().lines() {
|
||||||
|
// let line = line.expect("Failed to read line from stdin");
|
||||||
|
// println!("{}", line);
|
||||||
|
// }
|
||||||
|
|
||||||
|
let mut log_database = LogDatabase::new(TextParser { separator: None });
|
||||||
|
log_database.ingest(String::from("hello world")).unwrap();
|
||||||
|
log_database.ingest(String::from("have a good hello")).unwrap();
|
||||||
|
log_database.ingest(String::from("goodbye world")).unwrap();
|
||||||
|
|
||||||
|
println!("hello {:?}", log_database.find_value_string("hello").collect::<Vec<_>>());
|
||||||
|
println!("have {:?}", log_database.find_value_string("have").collect::<Vec<_>>());
|
||||||
|
println!("world {:?}", log_database.find_value_string("world").collect::<Vec<_>>());
|
||||||
|
println!("elliot {:?}", log_database.find_value_string("elliot").collect::<Vec<_>>());
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user