简体   繁体   中英

AWS CloudSearch export/download data

I have about 1.5 million documents in an AWS CloudSearch index. It is costing me too much and I wish to migrate off the service. I have been unable to see how I can download or export my documents from the index. Is it possible?

For a similar need, I had to browse my entire CloudSearch domain (more than the 10000 limit) to generate a file.

I used a nodeJS script to handle that, like this:

var AWS = require('aws-sdk');
var fs = require('fs');

AWS.config.update({
    accessKeyId: '<yourAccessKey>', secretAccessKey: '<yourSecretAccessKey>',
    region: '<yourRegion>',endpoint: '<YourSearchDomainEndPoint>'
});

var batchSize = 5000; //Number of item on every search... Max:10000    
var compteur = 0;
var result = [];

var params = {query:""};
var cloudsearchdomain = new AWS.CloudSearchDomain(params);

function launchSearch(theContext) {
    process.stdout.write('Launch AWS.CloudSearch ');

    if (theContext==null) {
        process.stdout.write('initial request ... ');
    } else {        
        var current  = (theContext.start/batchSize) +2 ;
        var totalRun = (Math.ceil(theContext.found/batchSize  * 10) / 10) + 1;
        process.stdout.write('( ' + current + ' / ' + totalRun + ' )       ... ');
    }

    params = {
           query:"-aQueryStringImpossibleToFind",
           cursor: (theContext==null)?"initial":theContext.cursor,
           size:batchSize 
    };  

    var forCursor = new AWS.CloudSearchDomain(params);

    forCursor.search(params, function(err, data) {
        if (err) {
            console.log("Failed with params :" );
            console.log(err);
        } else {
            resultMessage = data;       
            compteur = compteur + data.hits.hit.length;
            for(var i=0;i<data.hits.hit.length;i++){
                result.push(data.hits.hit[i]
                });
            }   
        }   

        process.stdout.write(resultMessage.hits.hit.length + ' hits found.');

        if (resultMessage.hits.hit.length==0) {
            process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
            writeTheFile(result);
        } else {
            process.stdout.write('\n');
            var myContext = {};
            myContext.cursor = resultMessage.hits.cursor;
            myContext.start = resultMessage.hits.start;
            myContext.found = resultMessage.hits.found;
            myContext.retrived = resultMessage.hits.hit.length;
            launchSearch(myContext);
        }
    });
}

function writeTheFile(myResult) {

    fs.writeFile(process.argv[2], JSON.stringify(myResult), function(err) {
        if(err) {
            return console.log(err);
        }
    });
    process.stdout.write("DONE : File '"+ process.argv[2] + "' generated  ( " + compteur + " elements ).\n");
}



 /*Check parameters*/
if (!process.argv[2]) {
     //console.log(process.argv);
    process.stdout.write('ERROR : the output filename is expected as argumment.\n');
    process.exit();
 } else {
    launchSearch();
}

This script has to be called from commandline : node script.js fileToCreate.json

Note : I don't know if this works correctly on a 1.5 millions documents searchdomain. The risk I forsee is the JSON variable size. So, this script has to be adapted (maybe a file write every 100 000 documents ?).

Nekloth

Amazon(仍然)没有提供从 Cloudsearch 域导出所有数据的方法,但是,自己编写一个实用程序来执行此操作并不困难。

just fixed a couple of things, full credit to @Nek's response https://stackoverflow.com/a/32119407/1894553


prerequisites, node + aws-sdk plugin

$ npm install aws-sdk

export-all.js

beware that in order to obtain a full dump with return: "_all_fields" param, this fields must have flag return enabled in the indexing options of the schema.

var AWS = require('aws-sdk');
var fs = require('fs');

AWS.config.update({
        accessKeyId: 'xx',
        secretAccessKey: 'xx',
        region: 'xx',
        endpoint: 'xxx'
});

var batchSize = 10000;
var compteur = 0;
var result = [];
var resultMessage = [];

var params = {query:""};
var cloudsearchdomain = new AWS.CloudSearchDomain(params);

function launchSearch(theContext) {
    process.stdout.write('Launch AWS.CloudSearch ');

    if (theContext==null) {
        process.stdout.write('initial request ... ');
    } else {
        var current  = (theContext.start/batchSize) +2 ;
        var totalRun = (Math.ceil(theContext.found/batchSize  * 10) / 10) + 1;
        process.stdout.write('( ' + current + ' / ' + totalRun + ' )       ... ');
    }

// https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/CloudSearchDomain.html#search-property
params = {
    query:"matchall",
    cursor: (theContext==null)?"initial":theContext.cursor,
    size:batchSize,
    queryParser: "structured",
    return: "_all_fields"
};
 
    var forCursor = new AWS.CloudSearchDomain(params);

    forCursor.search(params, function(err, data) {
        if (err) {
            console.log("Failed with params :" );
            console.log(err);
        } else {
            resultMessage = data;
            compteur = compteur + data.hits.hit.length;
            for(var i=0;i<data.hits.hit.length;i++){
                result.push(data.hits.hit[i]);
                };
            }


        process.stdout.write(resultMessage.hits.hit.length + ' hits found.');

        if (resultMessage.hits.hit.length==0) {
            process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
            writeTheFile(result);
        } else {
            process.stdout.write('\n');
            var myContext = {};
            myContext.cursor = resultMessage.hits.cursor;
            myContext.start = resultMessage.hits.start;
            myContext.found = resultMessage.hits.found;
            myContext.retrived = resultMessage.hits.hit.length;
            launchSearch(myContext);
        }
    });
}

function writeTheFile(myResult) {

    fs.writeFile(process.argv[2], JSON.stringify(myResult), function(err) {
        if(err) {
            return console.log(err);
        }
    });
    process.stdout.write("DONE : File '"+ process.argv[2] + "' generated  ( " + compteur + " elements ).\n");
}



 /*Check parameters*/
if (!process.argv[2]) {
     //console.log(process.argv);
    process.stdout.write('ERROR : the output filename is expected as argument.\n');
    process.exit();
 } else {
    launchSearch();
}  

#execution

$ node export-all.js all-data.json

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM