
How can I handle a large amount of ElasticSearch index operations?

I have developed a backend for a photo-sharing app with Node.js.
I use Firebase as the database.
Here is a screenshot of the 'Posts' section in Firebase.
I am going to index the 'Posts' section into ElasticSearch, following https://www.firebase.com/blog/2014-01-02-queries-part-two.html . Here is the indexing code:

// initialize our ElasticSearch API
var client = new ElasticClient({ host: 'localhost', port: 9200 });

// listen for changes to Firebase data
var fb = new Firebase('https://mydb.firebaseio.com/Posts');
fb.on('child_added',   createOrUpdateIndex);
fb.on('child_changed', createOrUpdateIndex);
fb.on('child_removed', removeIndex);
var index = 'firebase';
var type = 'post';

function createOrUpdateIndex(snap) {
   //var data = snap.val();
   //console.log(data);
   client.index(index, type, snap.val(), snap.key())
     .on('data', function(data) {
        console.log('indexed ', snap.key());
     })
     .on('error', function(err) {
        console.log(err);
     }).exec();
}

function removeIndex(snap) {
   client.deleteDocument(index, type, snap.key(), function(error, data) {
      if( error ) console.error('failed to delete', snap.key(), error);
      else console.log('deleted', snap.key());
   });
}

It works well with hundreds of posts. But with over 10K posts, it produces errors in the ElasticSearch log window, like below:

[2016-04-07 16:15:32,851][WARN ][indices.cluster          ] [Caretaker] [[firebase][1]] marking and sending shard failed due to [engine failure, reason [index]]
java.nio.file.FileSystemException: /Users/user/Downloads/elasticsearch-2.3.1/data/elasticsearch/nodes/0/indices/firebase/1/index/_a.fdt: Too many open files in system
    at sun.nio.fs.UnixException.translateToIOException(UnixException.java:91)
    at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:102)
    at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:107)
    at sun.nio.fs.UnixFileSystemProvider.newByteChannel(UnixFileSystemProvider.java:214)
    at java.nio.file.spi.FileSystemProvider.newOutputStream(FileSystemProvider.java:430)
    at java.nio.file.Files.newOutputStream(Files.java:172)
    at org.apache.lucene.store.FSDirectory$FSIndexOutput.<init>(FSDirectory.java:271)
    at org.apache.lucene.store.FSDirectory.createOutput(FSDirectory.java:224)
    at org.apache.lucene.store.FileSwitchDirectory.createOutput(FileSwitchDirectory.java:155)
    at org.apache.lucene.store.RateLimitedFSDirectory.createOutput(RateLimitedFSDirectory.java:40)
    at org.apache.lucene.store.FilterDirectory.createOutput(FilterDirectory.java:73)
    at org.apache.lucene.store.LockValidatingDirectoryWrapper.createOutput(LockValidatingDirectoryWrapper.java:44)
    at org.apache.lucene.store.TrackingDirectoryWrapper.createOutput(TrackingDirectoryWrapper.java:43)
    at org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.<init>(CompressingStoredFieldsWriter.java:111)
    at org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat.fieldsWriter(CompressingStoredFieldsFormat.java:128)
    at org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.fieldsWriter(Lucene50StoredFieldsFormat.java:183)
    at org.apache.lucene.index.DefaultIndexingChain.initStoredFieldsWriter(DefaultIndexingChain.java:81)
    at org.apache.lucene.index.DefaultIndexingChain.startStoredFields(DefaultIndexingChain.java:279)
    at org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:316)
    at org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:234)
    at org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:450)
    at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1477)
    at org.elasticsearch.index.engine.InternalEngine.innerIndex(InternalEngine.java:541)
    at org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:457)
    at org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:601)
    at org.elasticsearch.index.engine.Engine$Index.execute(Engine.java:836)
    at org.elasticsearch.action.index.TransportIndexAction.executeIndexRequestOnPrimary(TransportIndexAction.java:237)
    at org.elasticsearch.action.index.TransportIndexAction.shardOperationOnPrimary(TransportIndexAction.java:158)
    at org.elasticsearch.action.index.TransportIndexAction.shardOperationOnPrimary(TransportIndexAction.java:66)
    at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryPhase.doRun(TransportReplicationAction.java:639)
    at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
    at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:279)
    at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:271)
    at org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:75)
    at org.elasticsearch.transport.TransportService$4.doRun(TransportService.java:376)
    at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)


I have struggled with this error, and now I see that it occurs because ElasticSearch exceeds the maximum number of open files when a great number of 'child_added' events are triggered in a short time.

I think I need to cache (queue) the index operations to avoid this error while keeping the open-files limit at its default value. How can I do that?

You may use a semaphore to limit the number of running index actions to a specified capacity. See my post: https://stackoverflow.com/a/37456691/2733216

The code is not tested, but it should work.

// initialize our ElasticSearch API
var client = new ElasticClient({ host: 'localhost', port: 9200 });

// listen for changes to Firebase data
var fb = new Firebase('https://mydb.firebaseio.com/Posts');
fb.on('child_added',   createOrUpdateIndex);
fb.on('child_changed', createOrUpdateIndex);
fb.on('child_removed', removeIndex);
var index = 'firebase';
var type = 'post';

// create a semaphore of capacity 1 (the 'semaphore' npm module exports a
// factory that takes the capacity as its argument)
var semaphore = require('semaphore')(1);

function createOrUpdateIndex(snap) {
   // wait until a slot is free before firing the index request
   semaphore.take(function () {
      client.index(index, type, snap.val(), snap.key())
        .on('data', function(data) {
           semaphore.leave(); // release the slot on success
           console.log('indexed ', snap.key());
        })
        .on('error', function(err) {
           semaphore.leave(); // release the slot on failure too
           console.log(err);
        }).exec();
   });
}

function removeIndex(snap) {
   client.deleteDocument(index, type, snap.key(), function(error, data) {
      if( error ) console.error('failed to delete', snap.key(), error);
      else console.log('deleted', snap.key());
   });
}

Then, depending on your system's concurrency capacity, you can adjust the capacity from 1 to some higher value x, for example:
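With the 'semaphore' npm module the capacity is the argument of the factory call, so a hypothetical capacity of 10 would allow up to ten index requests in flight at once, while 1 fully serializes them:

// hypothetical capacity: allow up to 10 concurrent index operations
var semaphore = require('semaphore')(10);

If throttling alone is too slow for a large initial import, buffering documents and flushing them in one bulk request is a common complement. This is not part of the original answer; below is a minimal, untested sketch assuming the official 'elasticsearch' npm client (a different client than the ElasticClient above) and a hypothetical BATCH_SIZE:

var elasticsearch = require('elasticsearch');
var es = new elasticsearch.Client({ host: 'localhost:9200' });

var buffer = [];
var BATCH_SIZE = 500; // hypothetical value; tune to your document size

function queueIndex(snap) {
   // the bulk body alternates action/metadata lines and document sources
   buffer.push({ index: { _index: index, _type: type, _id: snap.key() } });
   buffer.push(snap.val());
   if (buffer.length >= BATCH_SIZE * 2) flush();
}

function flush() {
   if (buffer.length === 0) return;
   var body = buffer;
   buffer = [];
   es.bulk({ body: body }, function (err, resp) {
      if (err) console.error('bulk index failed', err);
      else console.log('bulk indexed', resp.items.length, 'documents');
   });
}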
