diff --git a/Cargo.lock b/Cargo.lock index 16b8326..e71356f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -373,6 +373,12 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "endian-type" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "869b0adbda23651a9c5c0c3d270aac9fcb52e8622a8f2b17e57802d7791962f2" + [[package]] name = "equivalent" version = "1.0.2" @@ -617,6 +623,15 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "nibble_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" +dependencies = [ + "smallvec", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -715,6 +730,16 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "radix_trie" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b4431027dcd37fc2a73ef740b5f233aa805897935b8bce0195e41bbf9a3289a" +dependencies = [ + "endian-type", + "nibble_vec", +] + [[package]] name = "ratatui" version = "0.29.0" @@ -795,6 +820,48 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", + "serde_core", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -821,7 +888,9 @@ dependencies = [ "crossterm 0.29.0", "eyre", "ordered-float", + "radix_trie", "ratatui", + "serde_json", "tracing", ] diff --git a/Cargo.toml b/Cargo.toml index 623bb42..05d4a6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,5 +18,7 @@ color-eyre = "0.6.5" crossterm = "0.29.0" eyre = "0.6.12" ordered-float = "5.0.0" +radix_trie = "0.3.0" ratatui = "0.29.0" +serde_json = "1.0.145" tracing = "0.1.41" diff --git a/src/log_database.rs b/src/log_database.rs index ad67364..bb4d3e4 100644 --- a/src/log_database.rs +++ b/src/log_database.rs @@ -1,12 +1,26 @@ -use std::collections::{HashMap, HashSet, BTreeMap}; +#![feature(btree_cursors)] + +use std::cmp::Ordering; +use std::collections::{HashMap, BTreeSet, BTreeMap}; +use std::ops::Bound; +use std::iter; use ordered_float::OrderedFloat; +use radix_trie::{Trie, TrieCommon}; -use crate::log_parser::{LogParser, ParseError}; +use crate::log_parser::{LogParser}; pub type LogIdx = usize; +#[derive(Debug)] +pub struct IngestError { + pub idx: LogIdx, + pub message: String, + pub column: u32, +} + pub enum LogValue { + Null, String(String), Integer(i64), Float(f64), @@ -33,24 +47,27 @@ pub struct LogDatabase { /// property names are always converted to strings /// nested properties are additionally indexed with each parent property prefixes, using slash notation /// Example: JSON { a: { b: c: 1 } } is indexed with the properties "c", "b/c", and "a/b/c" - property_index: HashMap>, + property_index: HashMap>, reverse_index: ReverseIndex, } impl LogDatabase { - pub fn new(parser: T) -> LogDatabase { - LogDatabase { + pub fn new(parser: T) -> Self { + Self { parser, - logs_raw: Vec::new(), property_index: HashMap::new(), reverse_index: ReverseIndex::new(), } } - pub fn ingest(&mut self, line: String) -> Result { - let log = self.parser.parse_line(&line)?; + pub fn ingest(&mut self, line: String) -> Result { let idx = self.logs_raw.len(); + let log = self.parser.parse_line(&line).map_err(|error| IngestError { + idx: idx, + message: error.message, + column: error.column, + })?; for field in log.fields.iter() { self.index_field(idx, &field.property, &field.value); } @@ -68,7 +85,7 @@ impl LogDatabase { idxs.insert(idx); } else { - let mut idxs = HashSet::new(); + let mut idxs = BTreeSet::new(); idxs.insert(idx); self.property_index.insert(property.clone(), idxs); } @@ -77,6 +94,9 @@ impl LogDatabase { // add to reverse index let field_location = FieldLocation { log_idx: idx, property: property.clone() }; match value { + LogValue::Null => { + self.reverse_index.nulls.insert(field_location); + }, LogValue::String(value) => { let entry = self.reverse_index.strings.get_mut(value); @@ -84,7 +104,7 @@ impl LogDatabase { field_locations.insert(field_location); } else { - let mut field_locations = HashSet::new(); + let mut field_locations = BTreeSet::new(); field_locations.insert(field_location); self.reverse_index.strings.insert(value.clone(), field_locations); } @@ -95,21 +115,32 @@ impl LogDatabase { field_locations.insert(field_location); } else { - let mut field_locations = HashSet::new(); + let mut field_locations = BTreeSet::new(); field_locations.insert(field_location); self.reverse_index.integers.insert(*value, field_locations); } }, LogValue::Float(value) => { let ordered = OrderedFloat(*value); - let entry = self.reverse_index.floats.get_mut(&ordered); - if let Some(field_locations) = entry { + if let Some(field_locations) = self.reverse_index.floats.get_mut(&ordered) { + field_locations.insert(field_location.clone()); + } + else { + let mut field_locations = BTreeSet::new(); + field_locations.insert(field_location.clone()); + self.reverse_index.floats.insert(ordered, field_locations); + + } + + // add to mantissa index + let bits = Bitsf64::from(*value); + if let Some(field_locations) = self.reverse_index.floats_mantissa.get_mut(&bits.mantissa) { field_locations.insert(field_location); } else { - let mut field_locations = HashSet::new(); + let mut field_locations = BTreeSet::new(); field_locations.insert(field_location); - self.reverse_index.floats.insert(ordered, field_locations); + self.reverse_index.floats_mantissa.insert(bits.mantissa, field_locations); } }, LogValue::Boolean(value) => { @@ -118,7 +149,7 @@ impl LogDatabase { field_locations.insert(field_location); } else { - let mut field_locations = HashSet::new(); + let mut field_locations = BTreeSet::new(); field_locations.insert(field_location); self.reverse_index.booleans.insert(*value, field_locations); } @@ -138,33 +169,116 @@ impl LogDatabase { } } - // TODO: implemnt as generic for LogValue types?? - pub fn find_value_string(&self, value: &str) -> impl Iterator { + pub fn find_value_str(&self, value: &str) -> impl Iterator { self.reverse_index.strings.get(value).into_iter().flatten() } + + pub fn find_matching_str(&self, value: &str) -> impl Iterator { + // prefix matching + self.reverse_index.strings.subtrie(value).into_iter() + .flat_map(|subtrie| { subtrie.iter() }) + .flat_map(|(_, locations)| { + locations.iter() + }) + } + + pub fn find_value_i64(&self, value: i64) -> impl Iterator { + self.reverse_index.integers.get(&value).into_iter().flatten() + } + + // 1, 1.2, 1.5, 1.8, 2, 2.5, 12, 125 + + // 1 + // 1, 1.2, 1.5, 1.8, 12, 125 + + // 2 + // 2, 2.5 + + // pub fn find_value_f64_mantissa(&self, value: f64) -> impl Iterator { + // let bits = Bitsf64::from(value); + // let base10_digits = bits.mantissa + // let range = self.reverse_index.floats_mantissa.range((Bound::Included(bits.mantissa), Bound::Excluded(bits.mantissa + ))); + // cursor.iter().flatten() + // } + + pub fn find_value_bool(&self, value: bool) -> impl Iterator { + self.reverse_index.booleans.get(&value).into_iter().flatten() + } + pub fn find_value_null(&self) -> impl Iterator { + self.reverse_index.nulls.iter() + } + } struct ReverseIndex { - strings: HashMap>, - integers: BTreeMap>, - floats: BTreeMap, HashSet>, - booleans: HashMap>, + strings: Trie>, + integers: BTreeMap>, + floats: BTreeMap, BTreeSet>, + floats_mantissa: BTreeMap>, + booleans: HashMap>, + nulls: BTreeSet, } -#[derive(PartialEq, Eq, Hash, Debug)] +#[derive(PartialEq, Eq, Hash, PartialOrd, Ord, Clone, Debug)] pub struct FieldLocation { - log_idx: LogIdx, - property: Option, + pub log_idx: LogIdx, + pub property: Option, } - impl ReverseIndex { - fn new() -> ReverseIndex { - ReverseIndex { - strings: HashMap::new(), + fn new() -> Self { + Self { + strings: Trie::new(), integers: BTreeMap::new(), floats: BTreeMap::new(), + floats_mantissa: BTreeMap::new(), booleans: HashMap::new(), + nulls: BTreeSet::new(), } } } + +// float representations + +const F64_EXPONENT_BITS: u32 = 11; +const F64_MANTISSA_BITS: u32 = 52; + +struct Bitsf64 { + pub sign: bool, + pub exponent: u32, + pub mantissa: u64, +} + +impl Bitsf64 { + fn from(f: f64) -> Self { + let bits = f.to_bits(); + Self { + sign: ((bits >> 63) & 1) == 1, + exponent: ((bits >> F64_MANTISSA_BITS) & ((1< Ordering { + (self.0.mantissa, self.0.exponent, self.0.sign).cmp(&(other.0.mantissa, other.0.exponent, other.0.sign)) + } +} + +impl PartialOrd for f64MantissaOrd { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for f64MantissaOrd { + fn eq(&self, other: &Self) -> bool { + (self.0.mantissa, self.0.exponent, self.0.sign) == (other.0.mantissa, other.0.exponent, other.0.sign) + } +} + +impl Eq for f64MantissaOrd { } + diff --git a/src/log_parser.rs b/src/log_parser.rs index 1fb0746..fab9654 100644 --- a/src/log_parser.rs +++ b/src/log_parser.rs @@ -1,56 +1,21 @@ -use crate::log_database::{Log, LogField, LogValue}; +mod text_parser; +pub use text_parser::TextParser; +mod json_parser; +pub use json_parser::JsonParser; + +use crate::log_database::{Log}; + +// Parses input into a string format +/// a Log consists of multiple keywords, which are indexed into the keyword database +/// each keyword has a data type, as well as text positional textual information for syntax highlighting +/// data can also be structured, so keywords need to know their position in the data structure for targetted queries pub trait LogParser { fn parse_line(&self, line: &str) -> Result; } #[derive(Debug)] pub struct ParseError { - pub message: &'static str, - pub col: u32, + pub message: String, + pub column: u32, } - - -// a Log consists of multiple keywords, which are indexed into the keyword database -// each keyword has a data type, as well as text positional textual information for syntax highlighting -// data can also be structured, so keywords need to know their position in the data structure for targetted queries - -pub struct TextParser<'a> { - pub separator: Option<&'a str>, -} - -/// Parses unstructured text word by word -impl<'a> LogParser for TextParser<'a> { - fn parse_line(&self, line: &str) -> Result { - let mut split_iter; - let mut split_whitespace_iter; - - let words: &mut dyn Iterator = match self.separator { - Some(separator) => { - split_iter = line.split(separator); - &mut split_iter - }, - None => { - split_whitespace_iter = line.split_whitespace(); - &mut split_whitespace_iter - }, - }; - - let fields: Vec = words - .map(|word| LogField { - property: None, - value: LogValue::String(String::from(word)) - }).collect(); - - Ok(Log {fields}) - } -} - -// pub struct JsonParser { -// } - -// impl LogParser for JsonParser { -// fn parse_line(&self, line: &str) -> Result { -// Ok(Log {}) -// } -// } diff --git a/src/log_parser/json_parser.rs b/src/log_parser/json_parser.rs new file mode 100644 index 0000000..a099ac0 --- /dev/null +++ b/src/log_parser/json_parser.rs @@ -0,0 +1,80 @@ +use std::collections::HashMap; + +use serde_json; + +use super::{LogParser, ParseError}; +use crate::log_database::{Log, LogField, LogValue}; + +pub struct JsonParser { +} + +/// Parses single-line JSON +impl LogParser for JsonParser { + fn parse_line(&self, line: &str) -> Result { + let json_value: serde_json::Value = match serde_json::from_str(line) { + Ok(value) => value, + Err(error) => return Err(ParseError { + message: error.to_string(), + column: u32::try_from(error.column()).expect("column to fit in u32"), + }), + }; + + let log_value = serde_value_to_log_value(json_value)?; + let fields: Vec = match log_value { + LogValue::Null + | LogValue::String(_) + | LogValue::Integer(_) + | LogValue::Float(_) + | LogValue::Boolean(_) => vec!(LogField { + property: None, + value: log_value, + }), + LogValue::Array(array) => array.into_iter().map(|v| LogField { + property: None, + value: v, + }).collect(), + LogValue::Map(map) => map.into_iter().map(|(k, v)| LogField { + property: Some(k), + value: v, + }).collect(), + }; + + Ok(Log {fields}) + } +} + +fn serde_value_to_log_value(json_value: serde_json::Value) -> Result { + let value = match json_value { + serde_json::Value::String(value) => LogValue::String(value), + serde_json::Value::Null => LogValue::Null, + serde_json::Value::Number(value) => { + if let Some(as_int) = value.as_i64() { LogValue::Integer(as_int) } + else if let Some(as_float) = value.as_f64() { LogValue::Float(as_float) } + else { return Err(ParseError { + message: format!("Failed to parse number: {}", value.to_string()), + column: 0, + // TODO: update with serde_json spanned values + // https://github.com/serde-rs/json/issues/637 + }) } + }, + serde_json::Value::Bool(value) => LogValue::Boolean(value), + serde_json::Value::Array(values) => { + let mut array: Box> = Box::new(Vec::new()); + for value in values { + array.push(serde_value_to_log_value(value)?); + } + + LogValue::Array(array) + }, + serde_json::Value::Object(properties) => { + let mut map: Box> = Box::new(HashMap::new()); + for (key, value) in properties { + map.insert(key, serde_value_to_log_value(value)?); + } + + LogValue::Map(map) + }, + }; + + return Ok(value); +} diff --git a/src/log_parser/text_parser.rs b/src/log_parser/text_parser.rs new file mode 100644 index 0000000..25c4541 --- /dev/null +++ b/src/log_parser/text_parser.rs @@ -0,0 +1,33 @@ +use super::{LogParser, ParseError}; +use crate::log_database::{Log, LogField, LogValue}; + +pub struct TextParser<'a> { + pub separator: Option<&'a str>, +} + +/// Parses unstructured text word by word +impl LogParser for TextParser<'_> { + fn parse_line(&self, line: &str) -> Result { + let mut split_iter; + let mut split_whitespace_iter; + + let words: &mut dyn Iterator = match self.separator { + Some(separator) => { + split_iter = line.split(separator); + &mut split_iter + }, + None => { + split_whitespace_iter = line.split_whitespace(); + &mut split_whitespace_iter + }, + }; + + let fields: Vec = words + .map(|word| LogField { + property: None, + value: LogValue::String(String::from(word)) + }).collect(); + + Ok(Log {fields}) + } +} diff --git a/src/main.rs b/src/main.rs index e57a905..33ccd1f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,7 @@ // use atty::Stream; use sift::log_database::LogDatabase; -use sift::log_parser::TextParser; +use sift::log_parser::{TextParser, JsonParser}; fn main() { // if atty::is(Stream::Stdin) { @@ -16,13 +16,28 @@ fn main() { // println!("{}", line); // } - let mut log_database = LogDatabase::new(TextParser { separator: None }); - log_database.ingest(String::from("hello world")).unwrap(); - log_database.ingest(String::from("have a good hello")).unwrap(); - log_database.ingest(String::from("goodbye world")).unwrap(); + // let mut log_database = LogDatabase::new(TextParser { separator: None }); + // log_database.ingest(String::from("hello world")).unwrap(); + // log_database.ingest(String::from("have a good hello")).unwrap(); + // log_database.ingest(String::from("goodbye world")).unwrap(); - println!("hello {:?}", log_database.find_value_string("hello").collect::>()); - println!("have {:?}", log_database.find_value_string("have").collect::>()); - println!("world {:?}", log_database.find_value_string("world").collect::>()); - println!("elliot {:?}", log_database.find_value_string("elliot").collect::>()); + // println!("hello {:?}", log_database.find_value_string("hello").collect::>()); + // println!("have {:?}", log_database.find_value_string("have").collect::>()); + // println!("world {:?}", log_database.find_value_string("world").collect::>()); + // println!("elliot {:?}", log_database.find_value_string("elliot").collect::>()); + + let mut log_database = LogDatabase::new(JsonParser {}); + log_database.ingest(String::from("null")).unwrap(); + log_database.ingest(String::from("\"hello\"")).unwrap(); + log_database.ingest(String::from("42")).unwrap(); + log_database.ingest(String::from("3.14")).unwrap(); + log_database.ingest(String::from("true")).unwrap(); + log_database.ingest(String::from("{\"greeting\": \"hello\", \"name\": \"world\"}")).unwrap(); + log_database.ingest(String::from("{\"greeting\": \"hello\", \"name\": \"elliot\"}")).unwrap(); + + println!("hello {:?}", log_database.find_value_str("hello").collect::>()); + println!("world {:?}", log_database.find_value_str("world").collect::>()); + + println!("matching"); + println!("hello {:?}", log_database.find_matching_str("hello").collect::>()); }