[英]Apache Flink: Performance issue when running many jobs
當提交包含大量 Flink SQL 查詢(下例中為 100 個)的作業時,Flink 命令行客戶端在 YARN 集群上提交失敗,並顯示「JobManager 在 600000 毫秒內未響應」,也就是說,該作業從未在集群上啟動。
/*
 * Counts rows of structStream that match one environment / rule / log type with a
 * successful outcome, per 1-second tumbling window on proctime and per distinct
 * combination of the selected columns.
 */
SELECT
    COUNT(*),
    'idnumber' AS criteria,
    Environment,
    CollectedTimestamp,
    EventTimestamp,
    RawMsg,
    Source
FROM structStream
WHERE Environment = 'MyEnvironment'
  AND Rule = 'MyRule'
  AND LogType = 'MyLogType'
  AND Outcome = 'Success'
GROUP BY
    TUMBLE(proctime, INTERVAL '1' SECOND),
    Environment,
    CollectedTimestamp,
    EventTimestamp,
    RawMsg,
    Source
/**
 * Entry point: registers the Kafka row stream as the table {@code structStream}, converts
 * every query in {@code Queries.sample} into an append stream that is printed, and then
 * executes the job under the name "Kafka Streaming SQL".
 *
 * <p>The query list is registered 100 times, so the job contains 100 copies of each query.
 *
 * @param args command-line arguments; not read by this method
 * @throws Exception if any setup step or the job execution fails
 */
public static void main(String[] args) throws Exception {
    // Create a NIO file system for the URI of the WHITELIST_CSV class-path resource.
    FileSystems.newFileSystem(
            KafkaReadingStreamingJob.class.getResource(WHITELIST_CSV).toURI(),
            new HashMap<>());

    final StreamExecutionEnvironment env = getStreamExecutionEnvironment();
    final StreamTableEnvironment tables = TableEnvironment.getTableEnvironment(env);

    // Expose the Kafka rows to SQL under the table name used by the queries.
    final DataStream<Row> kafkaRows = getKafkaStreamOfRows(env);
    tables.registerDataStream("structStream", kafkaRows);
    tables.scan("structStream").printSchema();

    final int copiesPerQuery = 100;
    for (int copy = 0; copy < copiesPerQuery; copy++) {
        // Per the original author's note, Queries.sample holds a single query.
        for (String sql : Queries.sample) {
            final Table result = tables.sqlQuery(sql);
            tables.toAppendStream(result, Row.class).print();
        }
    }

    // Submit the assembled job graph.
    env.execute("Kafka Streaming SQL");
}
/**
 * Adds a Kafka 0.11 consumer for {@code KAFKA_TOPIC_TO_CONSUME} as a source of the given
 * environment and returns the resulting stream of rows.
 *
 * @param environment the execution environment the source is added to
 * @return the stream produced by the Kafka source
 * @throws Exception if the Kafka properties or the row type information cannot be obtained
 */
private static DataStream<Row> getKafkaStreamOfRows(StreamExecutionEnvironment environment) throws Exception {
    final Properties properties = getKafkaProperties();
    // TestDeserializer deserializes the JSON to a Row of string columns (515)
    // and also adds a column for the raw message.
    // NOTE(review): the <Row> type argument assumes TestDeserializer is a deserialization
    // schema producing Row — confirm against its declaration.
    final FlinkKafkaConsumer011<Row> consumer = new FlinkKafkaConsumer011<>(
            KAFKA_TOPIC_TO_CONSUME, new TestDeserializer(getRowTypeInfo()), properties);
    // Parameterized consumer avoids the unchecked conversion the raw type required here.
    return environment.addSource(consumer);
}
/**
 * Builds the {@code RowTypeInfo} passed to {@code TestDeserializer}: the field names from
 * {@code DDIManager.getDDIFieldNames()} plus the two extra columns "rawkafka" and "proctime".
 *
 * @return row type information built from {@code typeInformationArray} and {@code fieldNamesArray}
 * @throws Exception declared by the signature; the throwing calls are not identifiable from this listing
 */
private static RowTypeInfo getRowTypeInfo() throws Exception {
// This has 515 fields.
List<String> fieldNames = DDIManager.getDDIFieldNames();
// NOTE(review): the add() calls below mutate the list returned by DDIManager.getDDIFieldNames();
// confirm it returns a fresh, mutable list on every call.
fieldNames.add("rawkafka"); // rawMessage added by TestDeserializer
fieldNames.add("proctime");
// Fill typeInformationArray with StringType to all but the last field which is of type Time
// (the code that populates typeInformationArray and fieldNamesArray is elided in this listing)
.....
return new RowTypeInfo(typeInformationArray, fieldNamesArray);
}
/**
 * Obtains the execution environment and configures it for this job: processing-time
 * characteristic, checkpointing with an interval of 60000, a file-system state backend
 * rooted at {@code CHECKPOINT_DIR}, and a parallelism of 725.
 *
 * @return the configured execution environment
 * @throws IOException declared by the signature (presumably for the state backend setup — confirm)
 */
private static StreamExecutionEnvironment getStreamExecutionEnvironment() throws IOException {
    final long checkpointInterval = 60000;
    final int jobParallelism = 725;

    final StreamExecutionEnvironment environment =
            StreamExecutionEnvironment.getExecutionEnvironment();
    environment.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
    environment.enableCheckpointing(checkpointInterval);
    environment.setStateBackend(new FsStateBackend(CHECKPOINT_DIR));
    environment.setParallelism(jobParallelism);
    return environment;
}
/**
 * Adds a Kafka 0.11 consumer for {@code KAFKA_TOPIC_TO_CONSUME} as a source of the given
 * environment and returns the resulting stream of rows.
 *
 * @param environment the execution environment the source is added to
 * @return the stream produced by the Kafka source
 * @throws Exception if the Kafka properties or the row type information cannot be obtained
 */
private static DataStream<Row> getKafkaStreamOfRows(StreamExecutionEnvironment environment) throws Exception {
    final Properties properties = getKafkaProperties();
    // TestDeserializer deserializes the JSON to a Row of string columns (515)
    // and also adds a column for the raw message.
    // NOTE(review): the <Row> type argument assumes TestDeserializer is a deserialization
    // schema producing Row — confirm against its declaration.
    final FlinkKafkaConsumer011<Row> consumer = new FlinkKafkaConsumer011<>(
            KAFKA_TOPIC_TO_CONSUME, new TestDeserializer(getRowTypeInfo()), properties);
    // Parameterized consumer avoids the unchecked conversion the raw type required here.
    return environment.addSource(consumer);
}
/**
 * Builds the {@code RowTypeInfo} passed to {@code TestDeserializer}: the field names from
 * {@code DDIManager.getDDIFieldNames()} plus the two extra columns "rawkafka" and "proctime".
 *
 * @return row type information built from {@code typeInformationArray} and {@code fieldNamesArray}
 * @throws Exception declared by the signature; the throwing calls are not identifiable from this listing
 */
private static RowTypeInfo getRowTypeInfo() throws Exception {
// This has 515 fields.
List<String> fieldNames = DDIManager.getDDIFieldNames();
// NOTE(review): the add() calls below mutate the list returned by DDIManager.getDDIFieldNames();
// confirm it returns a fresh, mutable list on every call.
fieldNames.add("rawkafka"); // rawMessage added by TestDeserializer
fieldNames.add("proctime");
// Fill typeInformationArray with StringType to all but the last field which is of type Time
// (the code that populates typeInformationArray and fieldNamesArray is elided in this listing)
.....
return new RowTypeInfo(typeInformationArray, fieldNamesArray);
}
/**
 * Obtains the execution environment and configures it for this job: processing-time
 * characteristic, checkpointing with an interval of 60000, a file-system state backend
 * rooted at {@code CHECKPOINT_DIR}, and a parallelism of 725.
 *
 * @return the configured execution environment
 * @throws IOException declared by the signature (presumably for the state backend setup — confirm)
 */
private static StreamExecutionEnvironment getStreamExecutionEnvironment() throws IOException {
    final long checkpointInterval = 60000;
    final int jobParallelism = 725;

    final StreamExecutionEnvironment environment =
            StreamExecutionEnvironment.getExecutionEnvironment();
    environment.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
    environment.enableCheckpointing(checkpointInterval);
    environment.setStateBackend(new FsStateBackend(CHECKPOINT_DIR));
    environment.setParallelism(jobParallelism);
    return environment;
}
在我看來,這似乎是 JobManager 因同時運行的作業過多而過載。我建議將這些作業分散到更多的 JobManager / Flink 集群上。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.