First off, this is an amazing library and I really appreciate it. It's so well organized and documented!
The problem I'm having is this: I have indexed a corpus of 792 objects. The word "broker" appears in four of these objects, but does not appear to be indexed at all. When I build the index and search for the word "broker" it returns 0 search results.
I've experimented with just selecting the TopicContentSearchModels with the word "broker" and indexing those. When I do this, the word "broker" does get indexed and the search works properly, returning four search results.
I also experimented with sorting the original list of objects and building the index that way. When I do that, "broker" gets indexed and the search works properly, returning four search result records.
For reference, I've included the search model as well as all my 'sandbox' code. I've also serialized the search models and attached the file, in case you're interested in reproducing the issue or would like some real-world data to play with.
/// <summary>
/// Item that gets added to the full-text index: one topic, identified by
/// <see cref="TopicId"/>, with two text fields.
/// </summary>
public class TopicContentSearchModel
{
    /// <summary>Identifier of the topic; used as the index key.</summary>
    public int TopicId { get; set; }

    /// <summary>Name of the topic; indexed as the "TopicName" field.</summary>
    public string TopicName { get; set; }

    /// <summary>Body text of the topic; indexed as the "Content" field.</summary>
    public string Content { get; set; }
}
/// <summary>
/// Sandbox routine that loads the serialized search models from disk and runs
/// three indexing scenarios, each followed by a search for the word "broker":
/// 1. index every model in one call;
/// 2. index the "broker" topics first, then add the remaining topics one by one;
/// 3. index the non-"broker" topics first, then add the "broker" topics one by one.
/// </summary>
/// <remarks>
/// Every <c>AddRangeAsync</c>/<c>AddAsync</c> call is now waited on before the index
/// is searched. Previously the returned tasks were discarded, so a search could run
/// before indexing had finished and any indexing failure went unobserved.
/// NOTE(review): this is the likely cause of the missing "broker" results, but that
/// should be confirmed against the attached data.
/// NOTE(review): the method stays synchronous to keep its signature. If it is called
/// from a thread with a synchronization context (e.g. a UI thread), prefer changing it
/// to <c>async Task</c> and awaiting these calls instead of blocking.
/// </remarks>
/// <exception cref="Exception">
/// Thrown when scenario 2 or 3 finds an unexpected number of "broker" results.
/// </exception>
private void SetUpTestForLifti()
{
    //*** BEGIN STUFF FOR LIFTI testing
    // Load the objects to be indexed.
    var jsonFile = File.ReadAllText("C:\\Users\\manderson\\Documents\\umdata.json");
    var searchModels =
        JsonConvert.DeserializeObject<System.Collections.Generic.List<TopicContentSearchModel>>(jsonFile);

    //**** SCENARIO 1
    //**** The word 'broker' is reported to exist in 4 items of this corpus,
    //**** but the search was not finding it.
    var indexWithAllModels = new FullTextIndexBuilder<int>()
        .WithDefaultTokenization(o => o.WithStemming())
        .WithQueryParser(o => o.WithDefaultJoiningOperator(QueryTermJoinOperatorKind.Or))
        .WithTextExtractor<XmlTextExtractor>()
        .WithObjectTokenization<TopicContentSearchModel>(
            itemOptions => itemOptions
                .WithKey(c => c.TopicId)
                .WithField("TopicName", f => f.TopicName)
                .WithField("Content", f => f.Content))
        .Build();

    // Wait for indexing to finish before searching.
    indexWithAllModels.AddRangeAsync<TopicContentSearchModel>(searchModels).GetAwaiter().GetResult();

    // Kept as a local so the result can be inspected in the debugger.
    var searchAllModels = indexWithAllModels.Search("broker").ToList();

    // Serialize search models for possible shipping off to the LIFTI author for help.
    var topicJson = JsonConvert.SerializeObject(searchModels);

    // Split the corpus into topics that mention "broker" (in either field) and the rest.
    var brokerTopics =
        searchModels.Where(x =>
            x.TopicName.IndexOf("broker", StringComparison.CurrentCultureIgnoreCase) != -1 ||
            x.Content.IndexOf("broker", StringComparison.CurrentCultureIgnoreCase) != -1
        ).ToList();
    var nonBrokerTopics = searchModels.Except(brokerTopics).ToList();

    //**** SCENARIO 2
    // Add the topics with "broker" first, then add the other items one by one
    // and check whether the number of items found for "broker" changes.
    var indexWithBrokersThenAddNonBrokerTopics = new FullTextIndexBuilder<int>()
        .WithDefaultTokenization(o => o.WithStemming())
        .WithQueryParser(o => o.WithDefaultJoiningOperator(QueryTermJoinOperatorKind.Or))
        .WithTextExtractor<XmlTextExtractor>()
        .WithObjectTokenization<TopicContentSearchModel>(
            itemOptions => itemOptions
                .WithKey(c => c.TopicId)
                .WithField("TopicName", f => f.TopicName)
                .WithField("Content", f => f.Content))
        .Build();

    // Add the records containing the term 'broker'.
    indexWithBrokersThenAddNonBrokerTopics.AddRangeAsync(brokerTopics).GetAwaiter().GetResult();

    // Add each record that does not contain 'broker'; the search is expected to keep returning 4.
    foreach (var nonBrokerTopic in nonBrokerTopics)
    {
        indexWithBrokersThenAddNonBrokerTopics.AddAsync(nonBrokerTopic).GetAwaiter().GetResult();
        var testAddingNonBrokerTopicToBrokerTopics = indexWithBrokersThenAddNonBrokerTopics.Search("broker").ToList();
        if (testAddingNonBrokerTopicToBrokerTopics.Count != 4)
        {
            throw new Exception($"Look out non-broker topic {nonBrokerTopic.TopicId} threw up");
        }
    }

    //**** SCENARIO 3
    // Add all topics without the word "broker", then add the topics with the
    // word "broker" one at a time and check that the index finds each of them.
    var testIndex2 = new FullTextIndexBuilder<int>()
        .WithDefaultTokenization(o => o.WithStemming())
        .WithQueryParser(o => o.WithDefaultJoiningOperator(QueryTermJoinOperatorKind.Or))
        .WithTextExtractor<XmlTextExtractor>()
        .WithObjectTokenization<TopicContentSearchModel>(
            itemOptions => itemOptions
                .WithKey(c => c.TopicId)
                .WithField("TopicName", f => f.TopicName)
                .WithField("Content", f => f.Content))
        .Build();

    testIndex2.AddRangeAsync(nonBrokerTopics).GetAwaiter().GetResult();

    for (var index = 0; index < brokerTopics.Count; index++)
    {
        var brokerTopic = brokerTopics[index];
        testIndex2.AddAsync(brokerTopic).GetAwaiter().GetResult();

        // After adding (index + 1) broker topics, exactly that many results are expected.
        var testSearch2 = testIndex2.Search("broker").ToList();
        if (testSearch2.Count != index + 1)
        {
            // Message corrected: the topic being reported here is a broker topic.
            throw new Exception($"Look out broker topic {brokerTopic.TopicId} threw up");
        }
    }
}