From e17bd5c8c1ba226dee4203245e450f80abb495ae Mon Sep 17 00:00:00 2001 From: Ibiyemi Abiodun Date: Fri, 9 May 2025 01:48:59 -0400 Subject: [PATCH 1/4] feat(snippet): report range of snippets in original text --- examples/snippet.rs | 7 +- src/snippet/mod.rs | 157 ++++++++++++++++++++++++++++++-------------- 2 files changed, 114 insertions(+), 50 deletions(-) diff --git a/examples/snippet.rs b/examples/snippet.rs index 31bd2c166f..cc74424cda 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -59,8 +59,11 @@ fn main() -> tantivy::Result<()> { let snippet = snippet_generator.snippet_from_doc(&doc); println!("Document score {score}:"); println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap()); - println!("snippet: {}", snippet.to_html()); - println!("custom highlighting: {}", highlight(snippet)); + + if let Some(snippet) = snippet { + println!("snippet: {}", snippet.to_html()); + println!("custom highlighting: {}", highlight(snippet)); + } } Ok(()) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 020e6b588a..43e65cd884 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -115,6 +115,7 @@ impl FragmentCandidate { #[derive(Debug)] pub struct Snippet { fragment: String, + fragment_range: Range, highlighted: Vec>, snippet_prefix: String, snippet_postfix: String, @@ -122,30 +123,16 @@ pub struct Snippet { impl Snippet { /// Create a new `Snippet`. - fn new(fragment: &str, highlighted: Vec>) -> Self { + fn new(source_str: &str, source_range: Range, highlighted: Vec>) -> Self { Self { - fragment: fragment.to_string(), + fragment: source_str[source_range.clone()].to_string(), + fragment_range: source_range, highlighted, snippet_prefix: DEFAULT_SNIPPET_PREFIX.to_string(), snippet_postfix: DEFAULT_SNIPPET_POSTFIX.to_string(), } } - /// Create a new, empty, `Snippet`. - pub fn empty() -> Snippet { - Snippet { - fragment: String::new(), - highlighted: Vec::new(), - snippet_prefix: String::new(), - snippet_postfix: String::new(), - } - } - - /// Returns `true` if the snippet is empty. - pub fn is_empty(&self) -> bool { - self.highlighted.len() == 0 - } - /// Returns a highlighted html from the `Snippet`. pub fn to_html(&self) -> String { let mut html = String::new(); @@ -169,6 +156,12 @@ impl Snippet { &self.fragment } + /// Returns the range of the original text that the fragment was extracted + /// from. + pub fn range(&self) -> Range { + self.fragment_range.clone() + } + /// Returns a list of highlighted positions from the `Snippet`. pub fn highlighted(&self) -> &[Range] { &self.highlighted @@ -231,7 +224,10 @@ fn search_fragments( /// /// Takes a vector of `FragmentCandidate`s and the text. /// Figures out the best fragment from it and creates a snippet. -fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) -> Snippet { +fn select_best_fragment_combination( + fragments: &[FragmentCandidate], + text: &str, +) -> Option { let best_fragment_opt = fragments.iter().max_by(|left, right| { let cmp_score = left .score @@ -243,18 +239,21 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) cmp_score } }); - if let Some(fragment) = best_fragment_opt { - let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; - let highlighted = fragment - .highlighted - .iter() - .map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset) - .collect(); - Snippet::new(fragment_text, highlighted) - } else { - // When there are no fragments to chose from, - // for now create an empty snippet. - Snippet::empty() + match best_fragment_opt { + Some(fragment) => { + let highlighted = fragment + .highlighted + .iter() + .map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset) + .collect(); + + Some(Snippet::new( + text, + fragment.start_offset..fragment.stop_offset, + highlighted, + )) + } + None => None, } } @@ -445,7 +444,7 @@ impl SnippetGenerator { /// /// This method extract the text associated with the `SnippetGenerator`'s field /// and computes a snippet. - pub fn snippet_from_doc(&self, doc: &D) -> Snippet { + pub fn snippet_from_doc(&self, doc: &D) -> Option { let mut text = String::new(); for (field, value) in doc.iter_fields_and_values() { let value = value as D::Value<'_>; @@ -463,13 +462,14 @@ impl SnippetGenerator { } /// Generates a snippet for the given text. - pub fn snippet(&self, text: &str) -> Snippet { + pub fn snippet(&self, text: &str) -> Option { let fragment_candidates = search_fragments( &mut self.tokenizer.clone(), text, &self.terms_text, self.max_num_chars, ); + select_best_fragment_combination(&fragment_candidates[..], text) } } @@ -520,7 +520,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 1.9); assert_eq!(first.stop_offset, 89); } - let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); + let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap(); assert_eq!( snippet.fragment, "Rust is a systems programming language sponsored by\nMozilla which describes it as a \ @@ -551,7 +551,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 1.0); assert_eq!(first.stop_offset, 17); } - let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); + let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap(); assert_eq!(snippet.to_html(), "Rust is a systems") } { @@ -571,7 +571,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 0.9); assert_eq!(first.stop_offset, 17); } - let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); + let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap(); assert_eq!(snippet.to_html(), "programming language") } } @@ -594,7 +594,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.stop_offset, 7); } - let snippet = select_best_fragment_combination(&fragments[..], text); + let snippet = select_best_fragment_combination(&fragments[..], text).unwrap(); assert_eq!(snippet.fragment, "c d"); assert_eq!(snippet.to_html(), "c d"); } @@ -617,7 +617,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.start_offset, 8); } - let snippet = select_best_fragment_combination(&fragments[..], text); + let snippet = select_best_fragment_combination(&fragments[..], text).unwrap(); assert_eq!(snippet.fragment, "e f"); assert_eq!(snippet.to_html(), "e f"); } @@ -641,7 +641,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.start_offset, 0); } - let snippet = select_best_fragment_combination(&fragments[..], text); + let snippet = select_best_fragment_combination(&fragments[..], text).unwrap(); assert_eq!(snippet.fragment, "e f g"); assert_eq!(snippet.to_html(), "e f g"); } @@ -659,9 +659,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(fragments.len(), 0); let snippet = select_best_fragment_combination(&fragments[..], text); - assert_eq!(snippet.fragment, ""); - assert_eq!(snippet.to_html(), ""); - assert!(snippet.is_empty()); + assert!(snippet.is_none()); } #[test] @@ -674,9 +672,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(fragments.len(), 0); let snippet = select_best_fragment_combination(&fragments[..], text); - assert_eq!(snippet.fragment, ""); - assert_eq!(snippet.to_html(), ""); - assert!(snippet.is_empty()); + assert!(snippet.is_none()); } #[test] @@ -751,7 +747,7 @@ Survey in 2016, 2017, and 2018."#; let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field).unwrap(); { - let snippet = snippet_generator.snippet(TEST_TEXT); + let snippet = snippet_generator.snippet(TEST_TEXT).unwrap(); assert_eq!( snippet.to_html(), "imperative-procedural paradigms. Rust is syntactically similar to \ @@ -761,7 +757,7 @@ Survey in 2016, 2017, and 2018."#; } { snippet_generator.set_max_num_chars(90); - let snippet = snippet_generator.snippet(TEST_TEXT); + let snippet = snippet_generator.snippet(TEST_TEXT).unwrap(); assert_eq!( snippet.to_html(), "Rust is syntactically similar to C++[according to whom?],\nbut its \ @@ -794,7 +790,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.stop_offset, 3); } - let snippet = select_best_fragment_combination(&fragments[..], text); + let snippet = select_best_fragment_combination(&fragments[..], text).unwrap(); assert_eq!(snippet.fragment, "abc"); assert_eq!(snippet.to_html(), "abc"); } @@ -808,7 +804,7 @@ Survey in 2016, 2017, and 2018."#; &terms, 100, ); - let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); + let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap(); assert_eq!( snippet.to_html(), "Rust is a systems programming language sponsored by\nMozilla which \ @@ -822,6 +818,71 @@ Survey in 2016, 2017, and 2018."#; ); } + #[test] + fn test_snippet_absolute_offsets() { + let text = "First sentence. The quick brown fox jumps over the lazy dog. Last sentence."; + let terms = btreemap! { + String::from("fox") => 1.0, + String::from("dog") => 0.9 + }; + + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + text, + &terms, + 100, + ); + + let snippet = select_best_fragment_combination(&fragments[..], text).unwrap(); + + // verify fragment range points to correct substring + // max_num_chars is 100, so our fragment should be the entire text + assert_eq!(snippet.fragment_range, 0..text.len() - 1); + assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment); + + // verify highlighted ranges are correct relative to original text + let absolute_highlights: Vec<&str> = snippet.highlighted + .iter() + .map(|highlight| (highlight.start + snippet.fragment_range.start)..(highlight.end + snippet.fragment_range.start)) + .map(|range| &text[range]) + .collect(); + + // "fox" and "dog" positions in original text + assert!(absolute_highlights.contains(&"fox")); // "fox" + assert!(absolute_highlights.contains(&"dog")); // "dog" + } + + #[test] + fn test_snippet_absolute_offsets_with_truncation() { + let text = "Intro text. The quick brown fox jumps over the lazy dog. The quick brown fox jumps again. End text."; + let terms = btreemap! { + String::from("fox") => 1.0, + String::from("quick") => 0.9 + }; + + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + text, + &terms, + 30, // short max chars to force truncation + ); + + let snippet = select_best_fragment_combination(&fragments[..], text).unwrap(); + + // verify fragment range points to correct substring + assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment); + + // verify highlighted ranges are correct relative to original text + let absolute_highlights: Vec<&str> = snippet.highlighted + .iter() + .map(|range| (range.start + snippet.fragment_range.start)..(range.end + snippet.fragment_range.start)) + .map(|range| &text[range]) + .collect(); + + assert!(absolute_highlights.contains(&"quick")); // "quick" + assert!(absolute_highlights.contains(&"fox")); // "fox" + } + #[test] fn test_collapse_overlapped_ranges() { #![allow(clippy::single_range_in_vec_init)] From 60d9d4b2f1649ace1df1a1f53b62acc15a4e6f3b Mon Sep 17 00:00:00 2001 From: Ibiyemi Abiodun Date: Fri, 9 May 2025 01:55:36 -0400 Subject: [PATCH 2/4] multi snippets --- src/snippet/mod.rs | 76 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 12 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 43e65cd884..cf550ca9e6 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -440,11 +440,7 @@ impl SnippetGenerator { &self.terms_text } - /// Generates a snippet for the given `Document`. - /// - /// This method extract the text associated with the `SnippetGenerator`'s field - /// and computes a snippet. - pub fn snippet_from_doc(&self, doc: &D) -> Option { + fn text_from_doc(&self, doc: &D) -> String { let mut text = String::new(); for (field, value) in doc.iter_fields_and_values() { let value = value as D::Value<'_>; @@ -458,7 +454,23 @@ impl SnippetGenerator { } } - self.snippet(text.trim()) + text + } + + /// Generates a snippet for the given `Document`. + /// + /// This method extract the text associated with the `SnippetGenerator`'s field + /// and computes a snippet. + pub fn snippet_from_doc(&self, doc: &D) -> Option { + self.snippet(self.text_from_doc(doc).trim()) + } + + /// Generates snippets for the given `Document`. + /// + /// This method extract the text associated with the `SnippetGenerator`'s field + /// and computes snippets. + pub fn snippets_from_doc(&self, doc: &D) -> Vec { + self.snippets(self.text_from_doc(doc).trim()) } /// Generates a snippet for the given text. @@ -472,6 +484,38 @@ impl SnippetGenerator { select_best_fragment_combination(&fragment_candidates[..], text) } + + /// Generates a snippet for the given text. + pub fn snippets(&self, text: &str) -> Vec { + let fragment_candidates = search_fragments( + &mut self.tokenizer.clone(), + text, + &self.terms_text, + self.max_num_chars, + ); + + let snippets = fragment_candidates + .iter() + .filter(|f| f.score > 0.0) + .map(|fragment| { + let highlighted = fragment + .highlighted + .iter() + .map(|item| { + item.start - fragment.start_offset..item.end - fragment.start_offset + }) + .collect(); + + Snippet::new( + text, + fragment.start_offset..fragment.stop_offset, + highlighted, + ) + }) + .collect(); + + snippets + } } #[cfg(test)] @@ -834,16 +878,20 @@ Survey in 2016, 2017, and 2018."#; ); let snippet = select_best_fragment_combination(&fragments[..], text).unwrap(); - + // verify fragment range points to correct substring // max_num_chars is 100, so our fragment should be the entire text assert_eq!(snippet.fragment_range, 0..text.len() - 1); assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment); // verify highlighted ranges are correct relative to original text - let absolute_highlights: Vec<&str> = snippet.highlighted + let absolute_highlights: Vec<&str> = snippet + .highlighted .iter() - .map(|highlight| (highlight.start + snippet.fragment_range.start)..(highlight.end + snippet.fragment_range.start)) + .map(|highlight| { + (highlight.start + snippet.fragment_range.start) + ..(highlight.end + snippet.fragment_range.start) + }) .map(|range| &text[range]) .collect(); @@ -868,14 +916,18 @@ Survey in 2016, 2017, and 2018."#; ); let snippet = select_best_fragment_combination(&fragments[..], text).unwrap(); - + // verify fragment range points to correct substring assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment); // verify highlighted ranges are correct relative to original text - let absolute_highlights: Vec<&str> = snippet.highlighted + let absolute_highlights: Vec<&str> = snippet + .highlighted .iter() - .map(|range| (range.start + snippet.fragment_range.start)..(range.end + snippet.fragment_range.start)) + .map(|range| { + (range.start + snippet.fragment_range.start) + ..(range.end + snippet.fragment_range.start) + }) .map(|range| &text[range]) .collect(); From 7f7779708544bfe8dcf31c153973181d768b968f Mon Sep 17 00:00:00 2001 From: Ibiyemi Abiodun Date: Mon, 12 May 2025 09:16:56 -0400 Subject: [PATCH 3/4] cargo fmt --- src/snippet/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index cf550ca9e6..2e7c20ed15 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -902,7 +902,8 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_absolute_offsets_with_truncation() { - let text = "Intro text. The quick brown fox jumps over the lazy dog. The quick brown fox jumps again. End text."; + let text = "Intro text. The quick brown fox jumps over the lazy dog. The quick brown fox \ + jumps again. End text."; let terms = btreemap! { String::from("fox") => 1.0, String::from("quick") => 0.9 From a2f489799f0fae240343c192b305bb94a4402592 Mon Sep 17 00:00:00 2001 From: Ibiyemi Abiodun Date: Mon, 12 May 2025 09:19:00 -0400 Subject: [PATCH 4/4] fix doctest --- src/snippet/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 2e7c20ed15..e0031c0766 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -42,7 +42,7 @@ //! # let searcher = reader.searcher(); //! let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?; //! snippet_generator.set_max_num_chars(100); -//! let snippet = snippet_generator.snippet_from_doc(&doc); +//! let snippet = snippet_generator.snippet_from_doc(&doc).unwrap(); //! let snippet_html: String = snippet.to_html(); //! assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les haleurs :\n Des"); //! # Ok(()) @@ -367,7 +367,7 @@ fn is_sorted(mut it: impl Iterator) -> bool { /// # let searcher = reader.searcher(); /// let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?; /// snippet_generator.set_max_num_chars(100); -/// let snippet = snippet_generator.snippet_from_doc(&doc); +/// let snippet = snippet_generator.snippet_from_doc(&doc).unwrap(); /// let snippet_html: String = snippet.to_html(); /// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les haleurs :\n Des"); /// # Ok(())