简体   繁体   中英

Use mapReduce on a 'logs' collection to generate HTTP stream

MongoDB newbie issue:

I have a lot of HTTP logs stored in a collection with the following data structure:

{
    'client': {
        'ip_address': '1.2.3.4',
        'referrer':"http://....",
        'user_agent':'Mozilla...'
    },
    'request':{
        "stream": "stream1",
        "method": "GET",
        "fragment_id": 97,
        "date": 13482181
    },
    'response':{
        'status':200,
        'size': 654
    }
}

Each document describes an HTTP request (from a client to a content streamer). As each stream is fragmented into smaller pieces, I would like to use "mapReduce" on my collection and then create a "generic stream request" document, as below:

{
    'client_ip': '1.2.3.4',
    'user_agent': 'Mozilla',
    'streams':[
        {
        'stream':"stream1",
        'referrer':'http://...',
        'requests':[
          {
             'fragment_id':97,
             'status':200,
             'date': 13482181,
             'size': 654
             ...
          },
          {
             'fragment_id':98,
             'status':200,
             'date': 13482192,
             'size': 624
             ...
          }, [...]
         ]
        }, [...]
    ]
}

Here is what I tried:

// Map step: emits one value per log document, keyed by client IP and user agent.
// Runs with `this` bound to the document being mapped; `emit` is expected to be
// supplied by the map/reduce runtime.
// NOTE(review): the fields read here (client.ip, client.referer,
// response.total_size, request.fragment_infos[1]) are named differently from the
// sample document shown above (ip_address, referrer, size, fragment_id) —
// confirm which names the stored documents really use.
map = function(){
    emit({client_ip:this.client.ip,user_agent:this.client.user_agent},{
                stream:this.request.stream,
                referrer:this.client.referer,
                status:this.response.status,
                date:this.request.date,
                size:this.response.total_size,
                fragment_id:this.request.fragment_infos[1]
    });
}

// Reduce step: wraps all values received for a key into a new object holding
// the number of values and the values themselves.
//   key    - the key emitted by map (not used in the body)
//   values - array of values to combine for that key
// Returns {count, request}.
// NOTE(review): the returned shape is not the shape map emits. If the runtime
// feeds a previous reduce result back in as one of `values` (MongoDB documents
// that reduce may be invoked more than once per key), that result is pushed
// into `request` as-is, which would explain the nested output shown below —
// confirm against the mapReduce documentation.
reduce = function(key,values){
    // `r` is assigned without a declaration, so it is not local to this function.
    r = {'count':0,'request':[]};
    values.forEach(function(v){
        r.count += 1;
        r.request.push(v);
    });

    return r;
}

but here is what I get as a result:

"_id" : {
    "client_ip" : "1.2.3.4",
    "user_agent" : "Mozilla\/4.0"
 },
 "value" : {
    "client_ip" : "1.2.3.4",
    "user_agent" : "Mozilla\/4.0",
    "count" : 17,
    "request" : {
        "0" : {
            "client_ip" : "1.2.3.4",
            "user_agent" : "Mozilla\/4.0",
            "count" : 2,
            "request" : {
                "0" : {
                    "stream" : "stream1.isml",
                    "referrer" : null,
                    "status" : 200,
                    "date" : 1341706566,
                    "size" : 456,
                    "fragment_id" : null,
                    "count" : 1
                },
                "1" : {
                    "stream" : "stream1.isml",
                    "referrer" : null,
                    "status" : 200,
                    "date" : 1341706566,
                    "size" : null,
                    "fragment_id" : null,
                    "count" : 1
                }
            }
        },
        "1" : {
            "client_ip" : "1.2.3.4",
            "user_agent" : "Mozilla\/4.0",
            "count" : 3,
            "request" : {
                "0" : {
                    "client_ip" : "1.2.3.4",
                    "user_agent" : "Mozilla\/4.0",
                    "count" : 2,
                    "request" : {
                        "0" : {
                            "stream" : "stream1.isml",
                            "referrer" : null,
                            "status" : 200,
                            "date" : 1341706568,
                            "size" : null,
                            "fragment_id" : null,
                            "count" : 1
.........

Where am I wrong?

You will always end up with a record that contains _id and value, this is a property of MongoDB map/reduce. There is an open ticket to change this behavior: https://jira.mongodb.org/browse/SERVER-2517

As for making the value line up with your example, the output of your map function should have the same form as the output you want from your reduce function.

/**
 * Map step: emits one partial "generic stream request" document per log
 * entry, keyed by client IP and user agent.
 *
 * The emitted value already has the shape wanted for the final result
 * (client_ip, user_agent, and a `streams` hash keyed by stream name), so a
 * reduce function only has to merge values of this same shape.
 *
 * Runs with `this` bound to the log document and relies on the `emit`
 * function provided by the map/reduce runtime. Returns nothing.
 */
var map = function () {
  var client = this.client;
  var request = this.request;
  var response = this.response;

  // An object literal cannot use the expression `this.request.stream` as a
  // key (that is a syntax error), so the per-stream entry is attached with
  // bracket notation, which works on every JavaScript engine.
  var streams = {};
  streams[request.stream] = {
    referrer: client.referer,
    requests: [
      {
        fragment_id: request.fragment_infos[1],
        status: response.status,
        date: request.date,
        size: response.total_size
      }
    ]
  };

  emit(
    { client_ip: client.ip, user_agent: client.user_agent },
    { client_ip: client.ip, user_agent: client.user_agent, streams: streams }
  );
};

You'll need to modify your reduce function to merge multiple documents of this form. If necessary, write a finalize function to convert the hash of streams to an array of streams with the stream name inside each element.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM