[英]Spring-batch partitioned step duplicated processing
我有數千條記錄要使用 spring-batch 處理,但目前耗時太長,無法滿足我們的業務需求。 其邏輯很簡單,就是逐條讀取 CSV 文件中的記錄並進行處理。
我想並行化步驟執行以加快批處理速度。 我認為最好的選擇是對步驟進行分區,例如在一個線程中處理第 1-100 條記錄,在另一個線程中處理第 101-200 條記錄,依此類推。 但實際情況卻是,每個線程都處理了所有記錄。
同樣的情況也發生在這個非常簡單的例子中:
/**
 * Batch configuration from the question: a partitioned master step whose worker step reads
 * {@code employees.csv} and prints every employee.
 *
 * <p>The reader defined here is given the whole resource and no item range, so each worker
 * step execution reads the complete file.
 */
@Configuration
@RequiredArgsConstructor
public class JobConfig {

    private final JobBuilderFactory jobBuilderFactory;
    private final StepBuilderFactory stepBuilderFactory;

    @Value("classpath:employees.csv")
    private Resource resourceFile;

    /** Builds the job, whose only step is the partitioned master step. */
    @Bean("MyJob1")
    public Job createJob() {
        return jobBuilderFactory
                .get("my job 1")
                .incrementer(new RunIdIncrementer())
                .start(step())
                .build();
    }

    /** Builds the master step, which hands its partitions to {@link #partitionHandler()}. */
    @Bean("MyStep1")
    public Step step() {
        SimplePartitioner partitioner = new SimplePartitioner();
        return stepBuilderFactory
                .get("my step 1")
                .partitioner("my step 1", partitioner)
                .partitionHandler(partitionHandler())
                .build();
    }

    /** Builds the worker step: read one employee, log its id, print it. */
    @Bean("slaveStep")
    public Step slaveStep() {
        ItemProcessor<Employee, Employee> announcingProcessor = employee -> {
            System.out.printf("Processed item %s%n", employee.getId());
            return employee;
        };
        return stepBuilderFactory
                .get("read csv stream")
                .<Employee, Employee>chunk(1)
                .reader(flatFileItemReader())
                .processor(announcingProcessor)
                .writer(list -> list.forEach(System.out::println))
                .build();
    }

    /**
     * Builds the step-scoped CSV reader. Lines are split on {@code ;}; the first token becomes
     * the id and the second the first name.
     */
    @StepScope
    @Bean
    public FlatFileItemReader<Employee> flatFileItemReader() {
        DefaultLineMapper<Employee> employeeMapper = new DefaultLineMapper<>();
        employeeMapper.setLineTokenizer(new DelimitedLineTokenizer(";"));
        employeeMapper.setFieldSetMapper(fieldSet -> {
            String[] columns = fieldSet.getValues();
            int id = Integer.parseInt(columns[0]);
            String firstName = columns[1];
            return Employee.builder().id(id).firstName(firstName).build();
        });

        FlatFileItemReader<Employee> csvReader = new FlatFileItemReader<>();
        csvReader.setResource(resourceFile);
        csvReader.setLineMapper(employeeMapper);
        return csvReader;
    }

    /** Builds the handler that runs the worker step on the task executor with a grid size of 5. */
    @Bean
    public PartitionHandler partitionHandler() {
        TaskExecutorPartitionHandler handler = new TaskExecutorPartitionHandler();
        handler.setGridSize(5);
        handler.setStep(slaveStep());
        handler.setTaskExecutor(taskExecutor());
        return handler;
    }

    /** Builds a thread pool with 5 core threads, 5 max threads and a queue capacity of 5. */
    @Bean
    public TaskExecutor taskExecutor() {
        ThreadPoolTaskExecutor pool = new ThreadPoolTaskExecutor();
        pool.setCorePoolSize(5);
        pool.setMaxPoolSize(5);
        pool.setQueueCapacity(5);
        pool.afterPropertiesSet();
        return pool;
    }
}
/**
 * Spring Boot entry point that launches the {@code MyJob1} job with empty job parameters
 * from its {@link CommandLineRunner} callback.
 */
@SpringBootApplication
@EnableBatchProcessing
public class SpringBatchTestsApplication implements CommandLineRunner {

    private final JobLauncher jobLauncher;
    private final Job job;

    /**
     * @param jobLauncher launcher used to start the job
     * @param job the job registered under the bean name {@code MyJob1}
     */
    public SpringBatchTestsApplication(JobLauncher jobLauncher, @Qualifier("MyJob1") Job job) {
        this.job = job;
        this.jobLauncher = jobLauncher;
    }

    public static void main(String[] args) {
        SpringApplication.run(SpringBatchTestsApplication.class, args);
    }

    /** Runs the job with an empty parameter set. */
    @Override
    public void run(String... args) throws Exception {
        JobParameters noParameters = new JobParameters();
        jobLauncher.run(job, noParameters);
    }
}
/**
 * Immutable employee value built from one CSV line.
 *
 * <p>Lombok's {@code @Value} makes every field {@code private final}, so the modifiers are not
 * repeated on the fields.
 */
@Builder
@Value
public class Employee {
    /** Identifier parsed from the first CSV column. */
    int id;
    /** First name taken from the second CSV column. */
    String firstName;
}
employees.csv:
1;Jakub
2;Mike
3;Pawel
4;Joana
5;Michal
6;Joe
7;Bailey
8;Bailhache
9;John
10;Eva
使用 5 個線程(gridSize)時預期的 output 示例(僅為示例,因為順序並不重要):
Processed item 1
Employee(id=1, firstName=Jakub)
Processed item 2
Employee(id=2, firstName=Mike)
Processed item 3
Employee(id=3, firstName=Pawel)
Processed item 4
Employee(id=4, firstName=Joana)
Processed item 5
Employee(id=5, firstName=Michal)
Processed item 6
Employee(id=6, firstName=Joe)
Processed item 7
Employee(id=7, firstName=Bailey)
Processed item 8
Employee(id=8, firstName=Bailhache)
Processed item 9
Employee(id=9, firstName=John)
Processed item 10
Employee(id=10, firstName=Eva)
但實際的 output 是以上內容重複出現了 5 次。
解決方案應該是使用自定義的分區器(Partitioner):由它負責計算文件的行數,並把行範圍分配給各個讀取器(reader),再通過 @Value("#{stepExecutionContext['fromLine']}")
把每個讀取器應當讀取的起始行注入進去。
另外,請通過依賴注入來獲取 bean,而不是直接調用 @Bean 方法。
/**
 * Batch configuration that splits {@code employees.csv} into contiguous item ranges, one per
 * partition, so that each worker step execution reads only its own slice of the file.
 *
 * <p>Collaborating beans are injected as method parameters instead of being obtained by calling
 * the {@code @Bean} methods directly.
 */
@Configuration
@RequiredArgsConstructor
public class JobConfig {

    private static final Logger log = LoggerFactory.getLogger(JobConfig.class);

    /** Number of partitions requested from the partitioner; also sizes the worker thread pool. */
    private static final int GRID_SIZE = 5;

    private final JobBuilderFactory jobBuilderFactory;
    private final StepBuilderFactory stepBuilderFactory;

    @Value(value = "classpath:employees.csv")
    private Resource resource;

    /**
     * Builds the job, whose only step is the partitioned master step.
     *
     * @param stepMaster the master step registered as {@code MyStep1}
     * @return the job registered as {@code MyJob1}
     */
    @Bean("MyJob1")
    public Job createJob(@Qualifier("MyStep1") Step stepMaster) {
        return jobBuilderFactory.get("MyJob1")
                .incrementer(new RunIdIncrementer())
                .start(stepMaster)
                .build();
    }

    /**
     * Builds the master step that asks the partitioner for item ranges and delegates their
     * execution to the partition handler.
     *
     * @param partitionHandler handler that runs the worker step once per partition
     * @param partitioner source of the per-partition execution contexts
     * @return the master step
     */
    @Bean("MyStep1")
    public Step step(PartitionHandler partitionHandler, Partitioner partitioner) {
        return stepBuilderFactory.get("MyStep1")
                .partitioner("slaveStep", partitioner)
                .partitionHandler(partitionHandler)
                .build();
    }

    /**
     * Builds the worker step: read one employee, log its id, print it.
     *
     * @param reader step-scoped reader restricted to the partition's item range
     * @return the worker step
     */
    @Bean("slaveStep")
    public Step slaveStep(FlatFileItemReader<Employee> reader) {
        return stepBuilderFactory.get("slaveStep")
                .<Employee, Employee>chunk(1)
                .reader(reader)
                .processor((ItemProcessor<Employee, Employee>) employee -> {
                    System.out.printf("Processed item %s%n", employee.getId());
                    return employee;
                })
                .writer(list -> {
                    for (Employee item : list) {
                        System.out.println(item);
                    }
                })
                .build();
    }

    /**
     * Builds a partitioner that counts the lines of the resource and cuts them into
     * {@code gridSize} contiguous, non-overlapping ranges.
     *
     * <p>Each execution context carries {@code fromLine} (number of items to skip) and
     * {@code toLine} (exclusive upper bound). When the line count is not a multiple of
     * {@code gridSize}, the surplus is spread one item each over the first partitions, so no
     * single partition receives the whole remainder.
     *
     * @return the partitioner
     * @throws IllegalArgumentException (at partition time) if {@code gridSize} is not positive
     */
    @Bean
    public Partitioner partitioner() {
        return gridSize -> {
            if (gridSize <= 0) {
                throw new IllegalArgumentException("gridSize must be positive but was " + gridSize);
            }
            int lines = countLines();
            int range = lines / gridSize;
            int remains = lines % gridSize;

            Map<String, ExecutionContext> result = new HashMap<>();
            int fromLine = 0;
            for (int i = 1; i <= gridSize; i++) {
                // The first `remains` partitions take one extra item each.
                int toLine = fromLine + range + (i <= remains ? 1 : 0);
                ExecutionContext value = new ExecutionContext();
                value.putInt("fromLine", fromLine);
                value.putInt("toLine", toLine);
                result.put("partition" + i, value);
                log.debug("partition{} covers items [{}, {})", i, fromLine, toLine);
                fromLine = toLine;
            }
            return result;
        };
    }

    /** Counts the lines of the configured resource. */
    private int countLines() {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(resource.getInputStream()))) {
            int lines = 0;
            while (reader.readLine() != null) {
                lines++;
            }
            return lines;
        } catch (IOException e) {
            throw new java.io.UncheckedIOException("Could not count the lines of " + resource, e);
        }
    }

    /**
     * Builds the step-scoped CSV reader for one partition. Lines are split on {@code ;}; the
     * first token becomes the id and the second the first name.
     *
     * @param startLine value of {@code fromLine} in the step execution context; used as the
     *     reader's current item count
     * @param lastLine value of {@code toLine} in the step execution context; used as the
     *     reader's max item count
     * @return the reader
     */
    @StepScope
    @Bean
    public FlatFileItemReader<Employee> flatFileItemReader(@Value("#{stepExecutionContext['fromLine']}") int startLine, @Value("#{stepExecutionContext['toLine']}") int lastLine) {
        FlatFileItemReader<Employee> reader = new FlatFileItemReader<>();
        reader.setResource(resource);
        DefaultLineMapper<Employee> lineMapper = new DefaultLineMapper<>();
        lineMapper.setFieldSetMapper(fieldSet -> {
            String[] values = fieldSet.getValues();
            return Employee.builder()
                    .id(Integer.parseInt(values[0]))
                    .firstName(values[1])
                    .build();
        });
        lineMapper.setLineTokenizer(new DelimitedLineTokenizer(";"));
        reader.setLineMapper(lineMapper);
        reader.setCurrentItemCount(startLine);
        reader.setMaxItemCount(lastLine);
        return reader;
    }

    /**
     * Builds the handler that runs the worker step once per partition on the task executor.
     *
     * @param step the worker step registered as {@code slaveStep}
     * @param taskExecutor executor that runs the partitions
     * @return the partition handler
     */
    @Bean
    public PartitionHandler partitionHandler(@Qualifier("slaveStep") Step step, TaskExecutor taskExecutor) {
        TaskExecutorPartitionHandler taskExecutorPartitionHandler = new TaskExecutorPartitionHandler();
        taskExecutorPartitionHandler.setTaskExecutor(taskExecutor);
        taskExecutorPartitionHandler.setStep(step);
        taskExecutorPartitionHandler.setGridSize(GRID_SIZE);
        return taskExecutorPartitionHandler;
    }

    /**
     * Builds the thread pool that runs the partitions.
     *
     * <p>{@code afterPropertiesSet()} is not called here: the returned object is a
     * container-managed bean, so Spring runs that initialization callback itself.
     *
     * @return the task executor
     */
    @Bean
    public TaskExecutor taskExecutor() {
        ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
        taskExecutor.setMaxPoolSize(GRID_SIZE);
        taskExecutor.setCorePoolSize(GRID_SIZE);
        taskExecutor.setQueueCapacity(GRID_SIZE);
        return taskExecutor;
    }
}
我不知道我的解決方案是否是最好的,但它是有效的。
我在 JobConfig 類中添加了兩個成員變量(字段):
// Item-count bounds handed to the next reader that is created.
private int startLine = -1;
private int lastLine = 1;
然後我在 flatFileItemReader 方法中把這兩個值設置到 reader 上,並在設置之後各自加 2:
// Give this reader the current bounds...
reader.setCurrentItemCount(startLine);
reader.setMaxItemCount(lastLine);
// ...then move both bounds forward by two for the next reader.
startLine+=2;
lastLine+=2;
所有代碼:
/**
 * Batch configuration that hands each newly created reader a fixed-size window of two items,
 * taken from counters kept on this configuration object.
 *
 * <p>NOTE(review): the counters start at {@code startLine = -1} / {@code lastLine = 1} and
 * advance by 2 per reader. With 5 partitions that yields upper bounds 1, 3, 5, 7 and 9;
 * confirm against the Spring Batch version in use that the first window is not one item short
 * and that the 10th record of the sample file is actually read.
 */
@Configuration
@RequiredArgsConstructor
public class JobConfig {

    private final JobBuilderFactory jobBuilderFactory;
    private final StepBuilderFactory stepBuilderFactory;

    /** Guards {@link #startLine} and {@link #lastLine}, which are shared by all readers. */
    private final Object windowLock = new Object();

    private int startLine = -1;
    private int lastLine = 1;

    @Value("classpath:employees.csv")
    private Resource resourceFile;

    /** Builds the job, whose only step is the partitioned master step. */
    @Bean("MyJob1")
    public Job createJob() {
        return jobBuilderFactory.get("my job 1")
                .incrementer(new RunIdIncrementer())
                .start(step())
                .build();
    }

    /** Builds the master step, which hands its partitions to {@link #partitionHandler()}. */
    @Bean("MyStep1")
    public Step step() {
        return stepBuilderFactory.get("my step 1")
                .partitioner("my step 1", new SimplePartitioner())
                .partitionHandler(partitionHandler())
                .build();
    }

    /** Builds the worker step: read one employee and log its id; the writer prints nothing. */
    @Bean("slaveStep")
    public Step slaveStep() {
        return stepBuilderFactory.get("read csv stream")
                .<Employee, Employee>chunk(1)
                .reader(flatFileItemReader())
                .processor((ItemProcessor<Employee, Employee>) employee -> {
                    System.out.printf("Processed item %s%n", employee.getId());
                    return employee;
                })
                .writer(list -> {
                    for (Employee item : list) {
                        //System.out.println(item);
                    }
                })
                .build();
    }

    /**
     * Builds a step-scoped CSV reader restricted to the next two-item window. Lines are split
     * on {@code ;}; the first token becomes the id and the second the first name.
     *
     * <p>Step-scoped beans are instantiated on the thread that runs the step execution, so
     * several worker threads can enter this method at the same time. Reading and advancing the
     * shared counters is therefore done under a lock; without it two partitions could be given
     * the same window.
     */
    @StepScope
    @Bean
    public FlatFileItemReader<Employee> flatFileItemReader() {
        FlatFileItemReader<Employee> reader = new FlatFileItemReader<>();
        reader.setResource(resourceFile);
        DefaultLineMapper<Employee> lineMapper = new DefaultLineMapper<>();
        lineMapper.setFieldSetMapper(fieldSet -> {
            String[] values = fieldSet.getValues();
            return Employee.builder()
                    .id(Integer.parseInt(values[0]))
                    .firstName(values[1])
                    .build();
        });
        lineMapper.setLineTokenizer(new DelimitedLineTokenizer(";"));
        reader.setLineMapper(lineMapper);
        synchronized (windowLock) {
            reader.setCurrentItemCount(startLine);
            reader.setMaxItemCount(lastLine);
            startLine += 2;
            lastLine += 2;
        }
        return reader;
    }

    /** Builds the handler that runs the worker step on the task executor with a grid size of 5. */
    @Bean
    public PartitionHandler partitionHandler() {
        TaskExecutorPartitionHandler taskExecutorPartitionHandler = new TaskExecutorPartitionHandler();
        taskExecutorPartitionHandler.setTaskExecutor(taskExecutor());
        taskExecutorPartitionHandler.setStep(slaveStep());
        taskExecutorPartitionHandler.setGridSize(5);
        return taskExecutorPartitionHandler;
    }

    /** Builds a thread pool with 5 core threads, 5 max threads and a queue capacity of 5. */
    @Bean
    public TaskExecutor taskExecutor() {
        ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
        taskExecutor.setMaxPoolSize(5);
        taskExecutor.setCorePoolSize(5);
        taskExecutor.setQueueCapacity(5);
        taskExecutor.afterPropertiesSet();
        return taskExecutor;
    }
}
我認為,我最初想通過分區來讀取 CSV 文件的想法在原則上就是錯誤的。 只有當要讀取的資源本身很容易被分割時,分區才有效。
但是,把文件分區、讓每個線程只讀取特定範圍的記錄,並不一定能帶來任何性能提升,參見:「將一個大文件拆分為多個 InputStream 以在 Java 中用多線程處理」。
利用多線程改進批處理的一種方法是為步驟添加 SimpleAsyncTaskExecutor。
這樣每個數據塊(chunk)的讀取、處理和寫入都會在單獨的線程中完成:
/**
 * Builds a multi-threaded chunk step: each chunk of one employee is read, processed and
 * written on a thread supplied by a {@code SimpleAsyncTaskExecutor}.
 *
 * <p>All of those threads share a single reader. {@code FlatFileItemReader} is not thread-safe,
 * so it is wrapped in a {@code SynchronizedItemStreamReader}, which serializes the
 * {@code read()} calls and forwards the stream callbacks to the delegate.
 *
 * <p>NOTE(review): restart state saved by the reader is not reliable when items are read from
 * several threads; consider {@code setSaveState(false)} on the delegate if restartability is
 * not required.
 *
 * @return the step registered as {@code MyStep1}
 */
@Bean("MyStep1")
public Step step() {
    org.springframework.batch.item.support.SynchronizedItemStreamReader<Employee> synchronizedReader =
            new org.springframework.batch.item.support.SynchronizedItemStreamReader<>();
    synchronizedReader.setDelegate(flatFileItemReader());

    return stepBuilderFactory.get("my step 1")
            .<Employee, Employee>chunk(1)
            .reader(synchronizedReader)
            .processor((ItemProcessor<Employee, Employee>) employee -> {
                System.out.printf("Processed item %s%n", employee.getId());
                return employee;
            })
            .writer(list -> {
                for (Employee item : list) {
                    System.out.println(item);
                }
            })
            .taskExecutor(new SimpleAsyncTaskExecutor("spring_batch"))
            .build();
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.