简体   繁体   中英

Need to return Distinct fields from ElasticSearch without the full search result documents using C# and Nest

I have Logs stored in ElasticSearch and a Windows Application using C# and Nest which is executing searches against the ElasticSearch. The mapping in ElasticSearch is shown below:

"mappings": {
    "qns": {
        "properties": {
            "@timestamp": {
                "format": "strict_date_optional_time||epoch_millis",
                "type": "date"
            },
            "Error_Description": {
                "index": "not_analyzed",
                "type": "string"
            },
            "Thread_Id": {
                "index": "not_analyzed",
                "type": "string"
            },
            "Error_Description_Analyzed": {
                "type": "string"
            },
            "Error_Source": {
                "index": "not_analyzed",
                "type": "string"
            },
            "record": {
                "type": "string"
            },
            "@version": {
                "type": "string"
            },
            "Log_Level": {
                "type": "string"
            },
            "Record": {
                "type": "string"
            },
            "id": {
                "type": "long"
            },
            "Error_Source_Analyzed": {
                "type": "string"
            },
            "Timestamp": {
                "format": "strict_date_optional_time||epoch_millis",
                "type": "date"
            }
        }
    }
}

The corresponding C# class is as follows:

[ElasticsearchType(IdProperty = "Id", Name = "qns")]
public class QNS
{
    [Number(NumberType.Long, Name = "id")]
    public long Id { get; set; }

    [Date(Name = "Timestamp")]
    public DateTime Timestamp { get; set; }

    [String(Name = "Error_Description", Index = FieldIndexOption.NotAnalyzed)]
    public string ErrorDescriptionKeyword { get; set; }

    [String(Name = "Error_Description_Analyzed")]
    public string ErrorDescriptionAnalyzed { get; set; }

    [String(Name = "Error_Source", Index = FieldIndexOption.NotAnalyzed)]
    public string ErrorSourceKeyword { get; set; }

    [String(Name = "Error_Source_Analyzed")]
    public string ErrorSourceAnalyzed { get; set; }

    [String(Name = "Thread_Id", Index = FieldIndexOption.NotAnalyzed)]
    public string ThreadId { get; set; }

    [String(Name = "Log_Level")]
    public string LogLevel { get; set; }

    [String(Index = FieldIndexOption.NotAnalyzed)]
    public string Record { get; set; }
}

I need a way to search for distinct error records that falls within a datetime range and matches a certain range of patterns. While I am able to get the result, but I am also getting all the documents that satisfy the search while I only need the distinct error strings. For the Distinct query I am using FluentNest( https://github.com/hoonzis/fluentnest ). The code for retrieving the results is as follows:

    private List<string> FindDistinctErrorsByPatternAndTimeRangeInternal(DateTime fromDateTime, DateTime toDateTime, List<pattern> patterns, string indexName, string type)
    {
        var documents = new List<QNS>();

        var fromTime = fromDateTime.ToString(Constants.IndexSearch.ES_DATETIME_FORMAT);
        var toTime = toDateTime.ToString(Constants.IndexSearch.ES_DATETIME_FORMAT);

        var patternQueries = new List<QueryContainer>();

        foreach (var p in patterns)
        {
            var pType = PatternType.unknown;
            if (Enum.TryParse<PatternType>(p.Pattern_Type.ToLowerInvariant(), out pType))
            {
                switch (pType)
                {
                    case PatternType.word:
                        patternQueries.Add(Query<QNS>.Regexp(r =>
                            r.Field(f =>
                                f.ErrorDescriptionAnalyzed)
                                .Value(p.Pattern_Description)
                            )
                       );
                        break;
                    case PatternType.phrase:
                        patternQueries.Add(Query<QNS>.MatchPhrase(m =>
                            m.Field(f =>
                                f.ErrorDescriptionAnalyzed)
                                .Query(p.Pattern_Description)
                            )
                        );
                        break;
                    case PatternType.unknown:
                    default:
                        break;
                }
            }
        }

        var datetimeQuery = Query<QNS>.QueryString(q =>
                                q.DefaultField(f =>
                                    f.Timestamp).Query($"[{fromTime} TO {toTime}]")
                                );

        var searchResults = client.Search<QNS>(s => s.Index(indexName)
           .Type(type)
           .Query(q =>
               q.Filtered(f =>
                   f.Filter(fq =>
                       fq.Bool(b =>
                           b.MinimumShouldMatch(1).Should(patternQueries.ToArray())
                       )
                   )
                   .Query(qd =>
                       qd.Bool(b =>
                           b.Must(datetimeQuery)
                       )
                   )
               )
            )
           .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder))
           .Aggregations(agg => agg.DistinctBy(q => q.ErrorDescriptionKeyword)));

        var results = searchResults.Aggs.AsContainer<QNS>().GetDistinct(d => d.ErrorDescriptionKeyword);

        return results.ToList();
    }

I need to modify this code to only return the distinct error strings and not the entire result set. The number of hits from the query is around 3500 and only 2 distinct error strings are present. So it does not make sense to get all those records back as I am not going to use it. Can someone help me get to the right aggregation query using the date range and pattern regex/phrase match to only return the distinct error records using Nest or Nest/FluentNest.

I think you are looking for the terms aggregation.

But you whole query is a bit strange. Do you have some legacy requirements.

First thing you have two fields ErrorDescriptionAnalyzed and ErrorDescriptionKeyword are you making a different field just to have one Analyzed and one not? Why don't you use multi-fields

Second Filtered method has obsolete for some time.

Here is a quick sample that I hope will help

ElasticClient db = new ElasticClient(uri);

            db.DeleteIndex(indexName);

            var mappings = new CreateIndexDescriptor(indexName).Mappings(ms => ms.Map<A>(map => map.
                AutoMap().
                Properties(props =>
                    props.String(p =>
                        p.Name(a => a.Text).
                        Fields(fields =>
                            fields.String(pr => pr.Name("raw").NotAnalyzed()))))));

            db.CreateIndex(mappings);

            foreach (var item in Enumerable.Range(0, 10).Select(i => new A
            {
                Price1 = random.NextDouble() * 1000,
                Date = i % 3 == 0 ? new DateTime(1900, 1, 1) : DateTime.Now,
                Text = i % 2 == 0 ? "ABC" : "EFG"
            }))
            {
                db.Index(item, inx => inx.Index(indexName));
            }

            var toDate = DateTime.Now + TimeSpan.FromDays(1);
            var fromDate = DateTime.Now - TimeSpan.FromDays(30);

            var data = db.Search<A>(s => 
                s.Index(indexName)
                .Query(q=>
                        q.DateRange(r => r.Field(f => f.Date).GreaterThan(fromDate).LessThanOrEquals(toDate))
                        &&
                        (
                        //term query is for finding words by default all words are lowercase but you can set a different analyzer
                        q.Term(t => t.Field(f => f.Text).Value("ABC".ToLower()))
                        ||
//Raw field is not analysed so no need to lower case you can use you query here if you want
                        q.Term(t => t.Field("text.raw").Value("EFG"))
                        )
                ).Aggregations(aggr => aggr.Terms("distinct", aterm => aterm.Field("text.raw"))));

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM