Add JSON parser, use radix trie for string matching

This commit is contained in:
2025-11-22 14:59:37 -07:00
parent edc83330cc
commit 36d0d8fa4d
7 changed files with 364 additions and 86 deletions

69
Cargo.lock generated
View File

@@ -373,6 +373,12 @@ version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "endian-type"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "869b0adbda23651a9c5c0c3d270aac9fcb52e8622a8f2b17e57802d7791962f2"
[[package]] [[package]]
name = "equivalent" name = "equivalent"
version = "1.0.2" version = "1.0.2"
@@ -617,6 +623,15 @@ dependencies = [
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
[[package]]
name = "nibble_vec"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43"
dependencies = [
"smallvec",
]
[[package]] [[package]]
name = "num-traits" name = "num-traits"
version = "0.2.19" version = "0.2.19"
@@ -715,6 +730,16 @@ dependencies = [
"proc-macro2", "proc-macro2",
] ]
[[package]]
name = "radix_trie"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b4431027dcd37fc2a73ef740b5f233aa805897935b8bce0195e41bbf9a3289a"
dependencies = [
"endian-type",
"nibble_vec",
]
[[package]] [[package]]
name = "ratatui" name = "ratatui"
version = "0.29.0" version = "0.29.0"
@@ -795,6 +820,48 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
"serde_core",
]
[[package]] [[package]]
name = "sharded-slab" name = "sharded-slab"
version = "0.1.7" version = "0.1.7"
@@ -821,7 +888,9 @@ dependencies = [
"crossterm 0.29.0", "crossterm 0.29.0",
"eyre", "eyre",
"ordered-float", "ordered-float",
"radix_trie",
"ratatui", "ratatui",
"serde_json",
"tracing", "tracing",
] ]

View File

@@ -18,5 +18,7 @@ color-eyre = "0.6.5"
crossterm = "0.29.0" crossterm = "0.29.0"
eyre = "0.6.12" eyre = "0.6.12"
ordered-float = "5.0.0" ordered-float = "5.0.0"
radix_trie = "0.3.0"
ratatui = "0.29.0" ratatui = "0.29.0"
serde_json = "1.0.145"
tracing = "0.1.41" tracing = "0.1.41"

View File

@@ -1,12 +1,26 @@
use std::collections::{HashMap, HashSet, BTreeMap}; #![feature(btree_cursors)]
use std::cmp::Ordering;
use std::collections::{HashMap, BTreeSet, BTreeMap};
use std::ops::Bound;
use std::iter;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use radix_trie::{Trie, TrieCommon};
use crate::log_parser::{LogParser, ParseError}; use crate::log_parser::{LogParser};
pub type LogIdx = usize; pub type LogIdx = usize;
#[derive(Debug)]
pub struct IngestError {
pub idx: LogIdx,
pub message: String,
pub column: u32,
}
pub enum LogValue { pub enum LogValue {
Null,
String(String), String(String),
Integer(i64), Integer(i64),
Float(f64), Float(f64),
@@ -33,24 +47,27 @@ pub struct LogDatabase<T: LogParser> {
/// property names are always converted to strings /// property names are always converted to strings
/// nested properties are additionally indexed with each parent property prefixes, using slash notation /// nested properties are additionally indexed with each parent property prefixes, using slash notation
/// Example: JSON { a: { b: c: 1 } } is indexed with the properties "c", "b/c", and "a/b/c" /// Example: JSON { a: { b: c: 1 } } is indexed with the properties "c", "b/c", and "a/b/c"
property_index: HashMap<String, HashSet<LogIdx>>, property_index: HashMap<String, BTreeSet<LogIdx>>,
reverse_index: ReverseIndex, reverse_index: ReverseIndex,
} }
impl<T: LogParser> LogDatabase<T> { impl<T: LogParser> LogDatabase<T> {
pub fn new(parser: T) -> LogDatabase<T> { pub fn new(parser: T) -> Self {
LogDatabase { Self {
parser, parser,
logs_raw: Vec::new(), logs_raw: Vec::new(),
property_index: HashMap::new(), property_index: HashMap::new(),
reverse_index: ReverseIndex::new(), reverse_index: ReverseIndex::new(),
} }
} }
pub fn ingest(&mut self, line: String) -> Result<LogIdx, ParseError> { pub fn ingest(&mut self, line: String) -> Result<LogIdx, IngestError> {
let log = self.parser.parse_line(&line)?;
let idx = self.logs_raw.len(); let idx = self.logs_raw.len();
let log = self.parser.parse_line(&line).map_err(|error| IngestError {
idx: idx,
message: error.message,
column: error.column,
})?;
for field in log.fields.iter() { for field in log.fields.iter() {
self.index_field(idx, &field.property, &field.value); self.index_field(idx, &field.property, &field.value);
} }
@@ -68,7 +85,7 @@ impl<T: LogParser> LogDatabase<T> {
idxs.insert(idx); idxs.insert(idx);
} }
else { else {
let mut idxs = HashSet::new(); let mut idxs = BTreeSet::new();
idxs.insert(idx); idxs.insert(idx);
self.property_index.insert(property.clone(), idxs); self.property_index.insert(property.clone(), idxs);
} }
@@ -77,6 +94,9 @@ impl<T: LogParser> LogDatabase<T> {
// add to reverse index // add to reverse index
let field_location = FieldLocation { log_idx: idx, property: property.clone() }; let field_location = FieldLocation { log_idx: idx, property: property.clone() };
match value { match value {
LogValue::Null => {
self.reverse_index.nulls.insert(field_location);
},
LogValue::String(value) => { LogValue::String(value) => {
let entry = self.reverse_index.strings.get_mut(value); let entry = self.reverse_index.strings.get_mut(value);
@@ -84,7 +104,7 @@ impl<T: LogParser> LogDatabase<T> {
field_locations.insert(field_location); field_locations.insert(field_location);
} }
else { else {
let mut field_locations = HashSet::new(); let mut field_locations = BTreeSet::new();
field_locations.insert(field_location); field_locations.insert(field_location);
self.reverse_index.strings.insert(value.clone(), field_locations); self.reverse_index.strings.insert(value.clone(), field_locations);
} }
@@ -95,21 +115,32 @@ impl<T: LogParser> LogDatabase<T> {
field_locations.insert(field_location); field_locations.insert(field_location);
} }
else { else {
let mut field_locations = HashSet::new(); let mut field_locations = BTreeSet::new();
field_locations.insert(field_location); field_locations.insert(field_location);
self.reverse_index.integers.insert(*value, field_locations); self.reverse_index.integers.insert(*value, field_locations);
} }
}, },
LogValue::Float(value) => { LogValue::Float(value) => {
let ordered = OrderedFloat(*value); let ordered = OrderedFloat(*value);
let entry = self.reverse_index.floats.get_mut(&ordered); if let Some(field_locations) = self.reverse_index.floats.get_mut(&ordered) {
if let Some(field_locations) = entry { field_locations.insert(field_location.clone());
}
else {
let mut field_locations = BTreeSet::new();
field_locations.insert(field_location.clone());
self.reverse_index.floats.insert(ordered, field_locations);
}
// add to mantissa index
let bits = Bitsf64::from(*value);
if let Some(field_locations) = self.reverse_index.floats_mantissa.get_mut(&bits.mantissa) {
field_locations.insert(field_location); field_locations.insert(field_location);
} }
else { else {
let mut field_locations = HashSet::new(); let mut field_locations = BTreeSet::new();
field_locations.insert(field_location); field_locations.insert(field_location);
self.reverse_index.floats.insert(ordered, field_locations); self.reverse_index.floats_mantissa.insert(bits.mantissa, field_locations);
} }
}, },
LogValue::Boolean(value) => { LogValue::Boolean(value) => {
@@ -118,7 +149,7 @@ impl<T: LogParser> LogDatabase<T> {
field_locations.insert(field_location); field_locations.insert(field_location);
} }
else { else {
let mut field_locations = HashSet::new(); let mut field_locations = BTreeSet::new();
field_locations.insert(field_location); field_locations.insert(field_location);
self.reverse_index.booleans.insert(*value, field_locations); self.reverse_index.booleans.insert(*value, field_locations);
} }
@@ -138,33 +169,116 @@ impl<T: LogParser> LogDatabase<T> {
} }
} }
// TODO: implemnt as generic for LogValue types?? pub fn find_value_str(&self, value: &str) -> impl Iterator<Item = &FieldLocation> {
pub fn find_value_string(&self, value: &str) -> impl Iterator<Item = &FieldLocation> {
self.reverse_index.strings.get(value).into_iter().flatten() self.reverse_index.strings.get(value).into_iter().flatten()
} }
pub fn find_matching_str(&self, value: &str) -> impl Iterator<Item = &FieldLocation> {
// prefix matching
self.reverse_index.strings.subtrie(value).into_iter()
.flat_map(|subtrie| { subtrie.iter() })
.flat_map(|(_, locations)| {
locations.iter()
})
}
pub fn find_value_i64(&self, value: i64) -> impl Iterator<Item = &FieldLocation> {
self.reverse_index.integers.get(&value).into_iter().flatten()
}
// 1, 1.2, 1.5, 1.8, 2, 2.5, 12, 125
// 1
// 1, 1.2, 1.5, 1.8, 12, 125
// 2
// 2, 2.5
// pub fn find_value_f64_mantissa(&self, value: f64) -> impl Iterator<Item = &FieldLocation> {
// let bits = Bitsf64::from(value);
// let base10_digits = bits.mantissa
// let range = self.reverse_index.floats_mantissa.range((Bound::Included(bits.mantissa), Bound::Excluded(bits.mantissa + )));
// cursor.iter().flatten()
// }
pub fn find_value_bool(&self, value: bool) -> impl Iterator<Item = &FieldLocation> {
self.reverse_index.booleans.get(&value).into_iter().flatten()
}
pub fn find_value_null(&self) -> impl Iterator<Item = &FieldLocation> {
self.reverse_index.nulls.iter()
}
} }
struct ReverseIndex { struct ReverseIndex {
strings: HashMap<String, HashSet<FieldLocation>>, strings: Trie<String, BTreeSet<FieldLocation>>,
integers: BTreeMap<i64, HashSet<FieldLocation>>, integers: BTreeMap<i64, BTreeSet<FieldLocation>>,
floats: BTreeMap<OrderedFloat<f64>, HashSet<FieldLocation>>, floats: BTreeMap<OrderedFloat<f64>, BTreeSet<FieldLocation>>,
booleans: HashMap<bool, HashSet<FieldLocation>>, floats_mantissa: BTreeMap<u64, BTreeSet<FieldLocation>>,
booleans: HashMap<bool, BTreeSet<FieldLocation>>,
nulls: BTreeSet<FieldLocation>,
} }
#[derive(PartialEq, Eq, Hash, Debug)] #[derive(PartialEq, Eq, Hash, PartialOrd, Ord, Clone, Debug)]
pub struct FieldLocation { pub struct FieldLocation {
log_idx: LogIdx, pub log_idx: LogIdx,
property: Option<String>, pub property: Option<String>,
} }
impl ReverseIndex { impl ReverseIndex {
fn new() -> ReverseIndex { fn new() -> Self {
ReverseIndex { Self {
strings: HashMap::new(), strings: Trie::new(),
integers: BTreeMap::new(), integers: BTreeMap::new(),
floats: BTreeMap::new(), floats: BTreeMap::new(),
floats_mantissa: BTreeMap::new(),
booleans: HashMap::new(), booleans: HashMap::new(),
nulls: BTreeSet::new(),
} }
} }
} }
// float representations
const F64_EXPONENT_BITS: u32 = 11;
const F64_MANTISSA_BITS: u32 = 52;
struct Bitsf64 {
pub sign: bool,
pub exponent: u32,
pub mantissa: u64,
}
impl Bitsf64 {
fn from(f: f64) -> Self {
let bits = f.to_bits();
Self {
sign: ((bits >> 63) & 1) == 1,
exponent: ((bits >> F64_MANTISSA_BITS) & ((1<<F64_EXPONENT_BITS)-1)) as u32,
mantissa: bits & ((1<<F64_MANTISSA_BITS)-1),
}
}
}
struct f64MantissaOrd(pub Bitsf64);
impl Ord for f64MantissaOrd {
fn cmp(&self, other: &Self) -> Ordering {
(self.0.mantissa, self.0.exponent, self.0.sign).cmp(&(other.0.mantissa, other.0.exponent, other.0.sign))
}
}
impl PartialOrd for f64MantissaOrd {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for f64MantissaOrd {
fn eq(&self, other: &Self) -> bool {
(self.0.mantissa, self.0.exponent, self.0.sign) == (other.0.mantissa, other.0.exponent, other.0.sign)
}
}
impl Eq for f64MantissaOrd { }

View File

@@ -1,56 +1,21 @@
use crate::log_database::{Log, LogField, LogValue}; mod text_parser;
pub use text_parser::TextParser;
mod json_parser;
pub use json_parser::JsonParser;
use crate::log_database::{Log};
// Parses input into a string format
/// a Log consists of multiple keywords, which are indexed into the keyword database
/// each keyword has a data type, as well as text positional textual information for syntax highlighting
/// data can also be structured, so keywords need to know their position in the data structure for targetted queries
pub trait LogParser { pub trait LogParser {
fn parse_line(&self, line: &str) -> Result<Log, ParseError>; fn parse_line(&self, line: &str) -> Result<Log, ParseError>;
} }
#[derive(Debug)] #[derive(Debug)]
pub struct ParseError { pub struct ParseError {
pub message: &'static str, pub message: String,
pub col: u32, pub column: u32,
} }
// a Log consists of multiple keywords, which are indexed into the keyword database
// each keyword has a data type, as well as text positional textual information for syntax highlighting
// data can also be structured, so keywords need to know their position in the data structure for targetted queries
pub struct TextParser<'a> {
pub separator: Option<&'a str>,
}
/// Parses unstructured text word by word
impl<'a> LogParser for TextParser<'a> {
fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
let mut split_iter;
let mut split_whitespace_iter;
let words: &mut dyn Iterator<Item = &str> = match self.separator {
Some(separator) => {
split_iter = line.split(separator);
&mut split_iter
},
None => {
split_whitespace_iter = line.split_whitespace();
&mut split_whitespace_iter
},
};
let fields: Vec<LogField> = words
.map(|word| LogField {
property: None,
value: LogValue::String(String::from(word))
}).collect();
Ok(Log {fields})
}
}
// pub struct JsonParser {
// }
// impl LogParser for JsonParser {
// fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
// Ok(Log {})
// }
// }

View File

@@ -0,0 +1,80 @@
use std::collections::HashMap;
use serde_json;
use super::{LogParser, ParseError};
use crate::log_database::{Log, LogField, LogValue};
pub struct JsonParser {
}
/// Parses single-line JSON
impl LogParser for JsonParser {
fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
let json_value: serde_json::Value = match serde_json::from_str(line) {
Ok(value) => value,
Err(error) => return Err(ParseError {
message: error.to_string(),
column: u32::try_from(error.column()).expect("column to fit in u32"),
}),
};
let log_value = serde_value_to_log_value(json_value)?;
let fields: Vec<LogField> = match log_value {
LogValue::Null
| LogValue::String(_)
| LogValue::Integer(_)
| LogValue::Float(_)
| LogValue::Boolean(_) => vec!(LogField {
property: None,
value: log_value,
}),
LogValue::Array(array) => array.into_iter().map(|v| LogField {
property: None,
value: v,
}).collect(),
LogValue::Map(map) => map.into_iter().map(|(k, v)| LogField {
property: Some(k),
value: v,
}).collect(),
};
Ok(Log {fields})
}
}
fn serde_value_to_log_value(json_value: serde_json::Value) -> Result<LogValue, ParseError> {
let value = match json_value {
serde_json::Value::String(value) => LogValue::String(value),
serde_json::Value::Null => LogValue::Null,
serde_json::Value::Number(value) => {
if let Some(as_int) = value.as_i64() { LogValue::Integer(as_int) }
else if let Some(as_float) = value.as_f64() { LogValue::Float(as_float) }
else { return Err(ParseError {
message: format!("Failed to parse number: {}", value.to_string()),
column: 0,
// TODO: update with serde_json spanned values
// https://github.com/serde-rs/json/issues/637
}) }
},
serde_json::Value::Bool(value) => LogValue::Boolean(value),
serde_json::Value::Array(values) => {
let mut array: Box<Vec<LogValue>> = Box::new(Vec::new());
for value in values {
array.push(serde_value_to_log_value(value)?);
}
LogValue::Array(array)
},
serde_json::Value::Object(properties) => {
let mut map: Box<HashMap<String, LogValue>> = Box::new(HashMap::new());
for (key, value) in properties {
map.insert(key, serde_value_to_log_value(value)?);
}
LogValue::Map(map)
},
};
return Ok(value);
}

View File

@@ -0,0 +1,33 @@
use super::{LogParser, ParseError};
use crate::log_database::{Log, LogField, LogValue};
pub struct TextParser<'a> {
pub separator: Option<&'a str>,
}
/// Parses unstructured text word by word
impl LogParser for TextParser<'_> {
fn parse_line(&self, line: &str) -> Result<Log, ParseError> {
let mut split_iter;
let mut split_whitespace_iter;
let words: &mut dyn Iterator<Item = &str> = match self.separator {
Some(separator) => {
split_iter = line.split(separator);
&mut split_iter
},
None => {
split_whitespace_iter = line.split_whitespace();
&mut split_whitespace_iter
},
};
let fields: Vec<LogField> = words
.map(|word| LogField {
property: None,
value: LogValue::String(String::from(word))
}).collect();
Ok(Log {fields})
}
}

View File

@@ -2,7 +2,7 @@
// use atty::Stream; // use atty::Stream;
use sift::log_database::LogDatabase; use sift::log_database::LogDatabase;
use sift::log_parser::TextParser; use sift::log_parser::{TextParser, JsonParser};
fn main() { fn main() {
// if atty::is(Stream::Stdin) { // if atty::is(Stream::Stdin) {
@@ -16,13 +16,28 @@ fn main() {
// println!("{}", line); // println!("{}", line);
// } // }
let mut log_database = LogDatabase::new(TextParser { separator: None }); // let mut log_database = LogDatabase::new(TextParser { separator: None });
log_database.ingest(String::from("hello world")).unwrap(); // log_database.ingest(String::from("hello world")).unwrap();
log_database.ingest(String::from("have a good hello")).unwrap(); // log_database.ingest(String::from("have a good hello")).unwrap();
log_database.ingest(String::from("goodbye world")).unwrap(); // log_database.ingest(String::from("goodbye world")).unwrap();
println!("hello {:?}", log_database.find_value_string("hello").collect::<Vec<_>>()); // println!("hello {:?}", log_database.find_value_string("hello").collect::<Vec<_>>());
println!("have {:?}", log_database.find_value_string("have").collect::<Vec<_>>()); // println!("have {:?}", log_database.find_value_string("have").collect::<Vec<_>>());
println!("world {:?}", log_database.find_value_string("world").collect::<Vec<_>>()); // println!("world {:?}", log_database.find_value_string("world").collect::<Vec<_>>());
println!("elliot {:?}", log_database.find_value_string("elliot").collect::<Vec<_>>()); // println!("elliot {:?}", log_database.find_value_string("elliot").collect::<Vec<_>>());
let mut log_database = LogDatabase::new(JsonParser {});
log_database.ingest(String::from("null")).unwrap();
log_database.ingest(String::from("\"hello\"")).unwrap();
log_database.ingest(String::from("42")).unwrap();
log_database.ingest(String::from("3.14")).unwrap();
log_database.ingest(String::from("true")).unwrap();
log_database.ingest(String::from("{\"greeting\": \"hello\", \"name\": \"world\"}")).unwrap();
log_database.ingest(String::from("{\"greeting\": \"hello\", \"name\": \"elliot\"}")).unwrap();
println!("hello {:?}", log_database.find_value_str("hello").collect::<Vec<_>>());
println!("world {:?}", log_database.find_value_str("world").collect::<Vec<_>>());
println!("matching");
println!("hello {:?}", log_database.find_matching_str("hello").collect::<Vec<_>>());
} }