[英]AWS CloudSearch export/download data
我在 AWS CloudSearch 索引中有大約 150 萬個文檔。 這項服務的費用太高,我希望遷移出該服務。 但我一直找不到從索引中下載或導出文檔的方法。 有可能做到嗎?
對於類似的需求,我不得不遍歷整個 CloudSearch 域(文檔數量超過了 10000 條的限制)來生成一個文件。
我使用了一個 nodeJS 腳本來處理它,如下所示:
var AWS = require('aws-sdk');
var fs = require('fs');
// SDK-wide settings. The angle-bracket values are placeholders: fill in the
// credentials, the region and the search endpoint before running the script.
var sdkSettings = {
  accessKeyId: '<yourAccessKey>',
  secretAccessKey: '<yourSecretAccessKey>',
  region: '<yourRegion>',
  endpoint: '<YourSearchDomainEndPoint>'
};
AWS.config.update(sdkSettings);

// Number of items requested on every search (the original author notes Max:10000).
var batchSize = 5000;

// File-level accumulators: every hit collected so far, and how many there are.
var result = [];
var compteur = 0;

// Initial (empty) search parameters; also handed to the client constructor below.
var params = {query:""};
var cloudsearchdomain = new AWS.CloudSearchDomain(params);
/**
 * Requests one page of search results and keeps requesting the following
 * page until a response comes back with no hits, then passes everything
 * collected in `result` to writeTheFile().
 *
 * The first request sends the cursor value "initial"; each later request
 * sends the cursor returned by the previous response. If a request fails,
 * the error is logged, the process exit code is set to 1 and paging stops
 * without writing a file.
 *
 * @param {Object} [theContext] - Paging state copied from the previous
 *   response (`cursor`, `start`, `found`, `retrived`). Omit it for the
 *   first request.
 */
function launchSearch(theContext) {
  process.stdout.write('Launch AWS.CloudSearch ');
  if (theContext == null) {
    process.stdout.write('initial request ... ');
  } else {
    // Progress display derived from the previous response's start/found values.
    var current = (theContext.start / batchSize) + 2;
    var totalRun = (Math.ceil(theContext.found / batchSize * 10) / 10) + 1;
    process.stdout.write('( ' + current + ' / ' + totalRun + ' ) ... ');
  }

  // Kept local so each request sends, and on failure logs, exactly its own parameters.
  var searchParams = {
    // NOTE(review): a negated term that presumably matches no document, so the
    // query is expected to select every document - confirm for your domain.
    query: "-aQueryStringImpossibleToFind",
    cursor: (theContext == null) ? "initial" : theContext.cursor,
    size: batchSize
  };

  // Reuse the file-level client rather than constructing a new one per page.
  cloudsearchdomain.search(searchParams, function (err, data) {
    if (err) {
      // Stop here: without a response there are no hits and no next cursor to read.
      console.log("Failed with params :");
      console.log(searchParams);
      console.log(err);
      process.exitCode = 1;
      return;
    }

    var hits = data.hits.hit;
    compteur = compteur + hits.length;
    for (var i = 0; i < hits.length; i++) {
      result.push(hits[i]);
    }
    process.stdout.write(hits.length + ' hits found.');

    if (hits.length === 0) {
      // An empty page marks the end of the result set.
      process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
      writeTheFile(result);
      return;
    }

    process.stdout.write('\n');
    launchSearch({
      cursor: data.hits.cursor,
      start: data.hits.start,
      found: data.hits.found,
      retrived: hits.length
    });
  });
}
/**
 * Serializes the collected hits as JSON into the file named by the first
 * command-line argument (process.argv[2]) and reports the outcome.
 *
 * The success message is printed from the write callback, so it appears only
 * after the file has actually been written. On failure the error is logged
 * and the process exit code is set to 1.
 *
 * @param {Array} myResult - Hits to serialize.
 */
function writeTheFile(myResult) {
  var outputPath = process.argv[2];
  fs.writeFile(outputPath, JSON.stringify(myResult), function (err) {
    if (err) {
      console.log(err);
      process.exitCode = 1;
      return;
    }
    process.stdout.write("DONE : File '" + outputPath + "' generated ( " + compteur + " elements ).\n");
  });
}
/* Check parameters: the output filename (first CLI argument) is mandatory. */
if (!process.argv[2]) {
  // Report usage errors on stderr and exit non-zero so shells and scripts
  // calling this tool can detect the failure.
  process.stderr.write('ERROR : the output filename is expected as argumment.\n');
  process.exit(1);
} else {
  launchSearch();
}
必須從命令行調用此腳本:node script.js fileToCreate.json
注意:我不知道這是否能在 150 萬個文檔搜索域上正常工作。 我預見的風險是 JSON 變量大小。 因此,必須修改此腳本(也許每 100 000 個文檔寫入一個文件?)。
內克洛斯
Amazon(仍然)沒有提供從 Cloudsearch 域導出所有數據的方法,但是,自己編寫一個實用程序來執行此操作並不困難。
我只是修復了幾處問題,功勞完全歸於 @Nek 的回答:https://stackoverflow.com/a/32119407/1894553
$ npm install aws-sdk
請注意:要通過 return: "_all_fields" 參數獲得完整轉儲,各字段必須在架構的索引選項中啟用 return 標志。
var AWS = require('aws-sdk');
var fs = require('fs');
// SDK-wide settings. The 'xx' / 'xxx' values are placeholders: fill in the
// credentials, the region and the search endpoint before running the script.
var sdkSettings = {
  accessKeyId: 'xx',
  secretAccessKey: 'xx',
  region: 'xx',
  endpoint: 'xxx'
};
AWS.config.update(sdkSettings);

// Number of items requested on every search.
var batchSize = 10000;

// File-level accumulators: every hit collected so far, and how many there are.
var result = [];
var compteur = 0;

// File-level holder for a search response; starts out as an empty array.
var resultMessage = [];

// Initial (empty) search parameters; also handed to the client constructor below.
var params = {query:""};
var cloudsearchdomain = new AWS.CloudSearchDomain(params);
/**
 * Requests one page of search results and keeps requesting the following
 * page until a response comes back with no hits, then passes everything
 * collected in `result` to writeTheFile().
 *
 * The first request sends the cursor value "initial"; each later request
 * sends the cursor returned by the previous response. If a request fails,
 * the error is logged, the process exit code is set to 1 and paging stops
 * without writing a file.
 *
 * @param {Object} [theContext] - Paging state copied from the previous
 *   response (`cursor`, `start`, `found`, `retrived`). Omit it for the
 *   first request.
 */
function launchSearch(theContext) {
  process.stdout.write('Launch AWS.CloudSearch ');
  if (theContext == null) {
    process.stdout.write('initial request ... ');
  } else {
    // Progress display derived from the previous response's start/found values.
    var current = (theContext.start / batchSize) + 2;
    var totalRun = (Math.ceil(theContext.found / batchSize * 10) / 10) + 1;
    process.stdout.write('( ' + current + ' / ' + totalRun + ' ) ... ');
  }

  // https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/CloudSearchDomain.html#search-property
  // Kept local so each request sends, and on failure logs, exactly its own parameters.
  var searchParams = {
    query: "matchall",
    cursor: (theContext == null) ? "initial" : theContext.cursor,
    size: batchSize,
    queryParser: "structured",
    return: "_all_fields"
  };

  // Reuse the file-level client rather than constructing a new one per page.
  cloudsearchdomain.search(searchParams, function (err, data) {
    if (err) {
      // Stop here: without a response there are no hits and no next cursor to read.
      console.log("Failed with params :");
      console.log(searchParams);
      console.log(err);
      process.exitCode = 1;
      return;
    }

    var hits = data.hits.hit;
    compteur = compteur + hits.length;
    for (var i = 0; i < hits.length; i++) {
      result.push(hits[i]);
    }
    process.stdout.write(hits.length + ' hits found.');

    if (hits.length === 0) {
      // An empty page marks the end of the result set.
      process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
      writeTheFile(result);
      return;
    }

    process.stdout.write('\n');
    launchSearch({
      cursor: data.hits.cursor,
      start: data.hits.start,
      found: data.hits.found,
      retrived: hits.length
    });
  });
}
/**
 * Serializes the collected hits as JSON into the file named by the first
 * command-line argument (process.argv[2]) and reports the outcome.
 *
 * The success message is printed from the write callback, so it appears only
 * after the file has actually been written. On failure the error is logged
 * and the process exit code is set to 1.
 *
 * @param {Array} myResult - Hits to serialize.
 */
function writeTheFile(myResult) {
  var outputPath = process.argv[2];
  fs.writeFile(outputPath, JSON.stringify(myResult), function (err) {
    if (err) {
      console.log(err);
      process.exitCode = 1;
      return;
    }
    process.stdout.write("DONE : File '" + outputPath + "' generated ( " + compteur + " elements ).\n");
  });
}
/* Check parameters: the output filename (first CLI argument) is mandatory. */
if (!process.argv[2]) {
  // Report usage errors on stderr and exit non-zero so shells and scripts
  // calling this tool can detect the failure.
  process.stderr.write('ERROR : the output filename is expected as argument.\n');
  process.exit(1);
} else {
  launchSearch();
}
#執行
$ node export-all.js all-data.json
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.