[英]Spring-batch partitioned step duplicated processing
我有数千条记录要使用 spring-batch 处理,这需要太多时间来满足我们的业务需求。 其逻辑很简单,就是逐条读取CSV文件并进行处理。
我想并行化步骤的执行以加快批处理速度。 我认为最好的办法是对步骤进行分区,例如让一个线程处理第 1-100 条记录,另一个线程处理第 101-200 条记录,依此类推。 但实际情况是,每个线程都处理了全部记录。
同样的情况也发生在这个非常简单的例子中:
/**
 * Job configuration from the question: a partitioned master step ("my step 1") that hands a
 * chunk-oriented worker step to a thread pool, with the worker reading employees.csv.
 *
 * NOTE(review): nothing in this class tells a worker which part of the file it owns (the reader
 * is configured only with the resource), which is consistent with the reported symptom of every
 * partition processing every record — confirm against the SimplePartitioner documentation.
 */
@Configuration
@RequiredArgsConstructor
public class JobConfig {
private final JobBuilderFactory jobBuilderFactory;
private final StepBuilderFactory stepBuilderFactory;
// CSV input; each line is "<id>;<firstName>" (see the tokenizer and field mapper below).
@Value("classpath:employees.csv")
private Resource resourceFile;
/** Builds the job "my job 1", which consists solely of the partitioned master step. */
@Bean("MyJob1")
public Job createJob() {
return jobBuilderFactory.get("my job 1")
.incrementer(new RunIdIncrementer())
.start(step())
.build();
}
/** Master step: delegates partition creation to SimplePartitioner and execution to partitionHandler(). */
@Bean("MyStep1")
public Step step() {
return stepBuilderFactory.get("my step 1")
// NOTE(review): SimplePartitioner presumably supplies no per-partition range data — confirm.
.partitioner("my step 1", new SimplePartitioner())
.partitionHandler(partitionHandler())
.build();
}
/** Worker step: reads one Employee per chunk, prints its id in the processor and the item in the writer. */
@Bean("slaveStep")
public Step slaveStep() {
return stepBuilderFactory.get("read csv stream")
.<Employee, Employee>chunk(1)
.reader(flatFileItemReader())
.processor((ItemProcessor<Employee, Employee>) employee -> {
System.out.printf("Processed item %s%n", employee.getId());
return employee;
})
.writer(list -> {
for (Employee item : list) {
System.out.println(item);
}
})
.build();
}
/**
 * Step-scoped reader over the CSV resource; maps column 0 to the id and column 1 to the first name.
 * No start/end position is set on the reader here.
 */
@StepScope
@Bean
public FlatFileItemReader<Employee> flatFileItemReader() {
FlatFileItemReader<Employee> reader = new FlatFileItemReader<>();
reader.setResource(resourceFile);
DefaultLineMapper<Employee> lineMapper = new DefaultLineMapper<>();
lineMapper.setFieldSetMapper(fieldSet -> {
String[] values = fieldSet.getValues();
return Employee.builder()
.id(Integer.parseInt(values[0]))
.firstName(values[1])
.build();
});
// Fields are separated by ';'.
lineMapper.setLineTokenizer(new DelimitedLineTokenizer(";"));
reader.setLineMapper(lineMapper);
return reader;
}
/** Partition handler that runs the worker step on the task executor with a grid size of 5. */
@Bean
public PartitionHandler partitionHandler() {
TaskExecutorPartitionHandler taskExecutorPartitionHandler = new TaskExecutorPartitionHandler();
taskExecutorPartitionHandler.setTaskExecutor(taskExecutor());
taskExecutorPartitionHandler.setStep(slaveStep());
taskExecutorPartitionHandler.setGridSize(5);
return taskExecutorPartitionHandler;
}
/** Thread pool with core size, max size and queue capacity all set to 5. */
@Bean
public TaskExecutor taskExecutor() {
ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
taskExecutor.setMaxPoolSize(5);
taskExecutor.setCorePoolSize(5);
taskExecutor.setQueueCapacity(5);
taskExecutor.afterPropertiesSet();
return taskExecutor;
}
}
/**
 * Application entry point. Besides bootstrapping Spring, it implements
 * {@link CommandLineRunner} so that {@link #run(String...)} launches the job
 * qualified as "MyJob1" with an empty set of job parameters.
 */
@SpringBootApplication
@EnableBatchProcessing
public class SpringBatchTestsApplication implements CommandLineRunner {

    private final JobLauncher jobLauncher;
    private final Job job;

    /**
     * @param jobLauncher launcher used to start the job
     * @param job         the job bean registered under the name "MyJob1"
     */
    public SpringBatchTestsApplication(JobLauncher jobLauncher, @Qualifier("MyJob1") Job job) {
        this.jobLauncher = jobLauncher;
        this.job = job;
    }

    public static void main(String[] args) {
        SpringApplication.run(SpringBatchTestsApplication.class, args);
    }

    /**
     * Launches the injected job with no parameters.
     *
     * @param args command-line arguments (unused)
     * @throws Exception whatever the job launcher throws
     */
    @Override
    public void run(String... args) throws Exception {
        JobParameters emptyParameters = new JobParameters();
        jobLauncher.run(job, emptyParameters);
    }
}
/**
 * One employee row from the CSV: a numeric id and a first name.
 * Instances are created through the generated builder ({@code Employee.builder()}).
 *
 * NOTE(review): {@code @Value} here is presumably Lombok's (paired with {@code @Builder}),
 * not Spring's property-injection annotation — confirm against the imports.
 */
@Value
@Builder
public class Employee {
// Parsed from the first CSV column.
private final int id;
// Taken verbatim from the second CSV column.
private final String firstName;
}
employees.csv:
1;Jakub
2;Mike
3;Pawel
4;Joana
5;Michal
6;Joe
7;Bailey
8;Bailhache
9;John
10;Eva
使用 5 个线程(gridSize)时的预期输出示例(仅为示例,因为顺序并不重要):
Processed item 1
Employee(id=1, firstName=Jakub)
Processed item 2
Employee(id=2, firstName=Mike)
Processed item 3
Employee(id=3, firstName=Pawel)
Processed item 4
Employee(id=4, firstName=Joana)
Processed item 5
Employee(id=5, firstName=Michal)
Processed item 6
Employee(id=6, firstName=Joe)
Processed item 7
Employee(id=7, firstName=Bailey)
Processed item 8
Employee(id=8, firstName=Bailhache)
Processed item 9
Employee(id=9, firstName=John)
Processed item 10
Employee(id=10, firstName=Eva)
实际输出是上述内容重复出现 5 次。
解决方案是使用分区器(Partitioner):它负责统计文件的行数并把行范围分配给各个 reader,再通过 @Value("#{stepExecutionContext['fromLine']}") 把每个 reader 应读取的行范围注入进去。
请使用依赖注入,而不是直接调用 @Bean 方法。
/**
 * Spring Batch configuration that processes {@code employees.csv} in parallel.
 *
 * <p>The master step asks {@link #partitioner()} to split the file into contiguous line ranges,
 * stores each range in a partition's {@link ExecutionContext} under the keys {@code fromLine}
 * and {@code toLine}, and lets the partition handler run one worker step per range on a thread
 * pool. The step-scoped reader receives its range through late binding.
 */
@Configuration
@RequiredArgsConstructor
public class JobConfig {

    private static final Logger log = LoggerFactory.getLogger(JobConfig.class);

    private final JobBuilderFactory jobBuilderFactory;
    private final StepBuilderFactory stepBuilderFactory;

    /** CSV input; each line is {@code <id>;<firstName>}. */
    @Value(value = "classpath:employees.csv")
    private Resource resource;

    /**
     * Builds the job, which consists solely of the partitioned master step.
     *
     * @param stepMaster the master step registered as "MyStep1"
     * @return the job registered as "MyJob1"
     */
    @Bean("MyJob1")
    public Job createJob(@Qualifier("MyStep1") Step stepMaster) {
        return jobBuilderFactory.get("MyJob1")
                .incrementer(new RunIdIncrementer())
                .start(stepMaster)
                .build();
    }

    /**
     * Master step: creates the partitions and delegates their execution.
     *
     * @param partitionHandler runs the worker step once per partition
     * @param partitioner      computes the line range of every partition
     * @return the partitioned master step
     */
    @Bean("MyStep1")
    public Step step(PartitionHandler partitionHandler, Partitioner partitioner) {
        return stepBuilderFactory.get("MyStep1")
                .partitioner("slaveStep", partitioner)
                .partitionHandler(partitionHandler)
                .build();
    }

    /**
     * Worker step: reads one employee per chunk, prints its id while processing and prints the
     * item itself when writing.
     *
     * @param reader step-scoped reader limited to the partition's line range
     * @return the worker step
     */
    @Bean("slaveStep")
    public Step slaveStep(FlatFileItemReader<Employee> reader) {
        return stepBuilderFactory.get("slaveStep")
                .<Employee, Employee>chunk(1)
                .reader(reader)
                .processor((ItemProcessor<Employee, Employee>) employee -> {
                    System.out.printf("Processed item %s%n", employee.getId());
                    return employee;
                })
                .writer(list -> {
                    for (Employee item : list) {
                        System.out.println(item);
                    }
                })
                .build();
    }

    /**
     * Splits the file into {@code gridSize} contiguous, half-open line ranges
     * {@code [fromLine, toLine)}.
     *
     * <p>Leftover lines ({@code lines % gridSize}) are handed out one per partition starting with
     * the first, so range sizes differ by at most one. Piling the whole remainder onto the last
     * partition would, for a file shorter than {@code gridSize}, put every line into a single
     * partition and remove all parallelism. When there are fewer lines than partitions the
     * trailing partitions receive an empty range.
     *
     * @return a partitioner producing the entries {@code partition1 .. partition<gridSize>}
     */
    @Bean
    public Partitioner partitioner() {
        return gridSize -> {
            int lines = countLines();
            int range = lines / gridSize;
            int remains = lines % gridSize;

            Map<String, ExecutionContext> result = new HashMap<>();
            int fromLine = 0;
            for (int i = 1; i <= gridSize; i++) {
                // The first `remains` partitions each take one of the leftover lines.
                int toLine = fromLine + range + (i <= remains ? 1 : 0);
                ExecutionContext value = new ExecutionContext();
                value.putInt("fromLine", fromLine);
                value.putInt("toLine", toLine);
                result.put("partition" + i, value);
                fromLine = toLine;
            }
            log.debug("Split {} lines into {} partitions", lines, gridSize);
            return result;
        };
    }

    /**
     * Counts the lines of the CSV resource.
     *
     * @return the number of lines in the resource
     * @throws IllegalStateException if the resource cannot be read
     */
    private int countLines() {
        int lines = 0;
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(resource.getInputStream()))) {
            while (reader.readLine() != null) {
                lines++;
            }
        } catch (IOException e) {
            throw new IllegalStateException("Unable to count the lines of " + resource, e);
        }
        return lines;
    }

    /**
     * Step-scoped reader restricted to the line range of the current partition.
     *
     * @param startLine number of leading items to skip (the partition's {@code fromLine})
     * @param lastLine  item count at which reading stops (the partition's {@code toLine})
     * @return a reader mapping column 0 to the id and column 1 to the first name
     */
    @StepScope
    @Bean
    public FlatFileItemReader<Employee> flatFileItemReader(@Value("#{stepExecutionContext['fromLine']}") int startLine, @Value("#{stepExecutionContext['toLine']}") int lastLine) {
        FlatFileItemReader<Employee> reader = new FlatFileItemReader<>();
        reader.setResource(resource);
        DefaultLineMapper<Employee> lineMapper = new DefaultLineMapper<>();
        lineMapper.setFieldSetMapper(fieldSet -> {
            String[] values = fieldSet.getValues();
            return Employee.builder()
                    .id(Integer.parseInt(values[0]))
                    .firstName(values[1])
                    .build();
        });
        lineMapper.setLineTokenizer(new DelimitedLineTokenizer(";"));
        reader.setLineMapper(lineMapper);
        // Together these two bounds confine the reader to [startLine, lastLine).
        reader.setCurrentItemCount(startLine);
        reader.setMaxItemCount(lastLine);
        return reader;
    }

    /**
     * Partition handler that runs the worker step on the thread pool with a grid size of 5.
     *
     * @param step         the worker step registered as "slaveStep"
     * @param taskExecutor executor on which the partitions run
     * @return the configured partition handler
     */
    @Bean
    public PartitionHandler partitionHandler(@Qualifier("slaveStep") Step step, TaskExecutor taskExecutor) {
        TaskExecutorPartitionHandler taskExecutorPartitionHandler = new TaskExecutorPartitionHandler();
        taskExecutorPartitionHandler.setTaskExecutor(taskExecutor);
        taskExecutorPartitionHandler.setStep(step);
        taskExecutorPartitionHandler.setGridSize(5);
        return taskExecutorPartitionHandler;
    }

    /**
     * Thread pool with core size, max size and queue capacity all set to 5.
     *
     * <p>{@code afterPropertiesSet()} is deliberately not called here: the container invokes it on
     * the returned bean, and calling it a second time would build a second, discarded pool.
     *
     * @return the executor used for the partitions
     */
    @Bean
    public TaskExecutor taskExecutor() {
        ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
        taskExecutor.setMaxPoolSize(5);
        taskExecutor.setCorePoolSize(5);
        taskExecutor.setQueueCapacity(5);
        return taskExecutor;
    }
}
我不知道我的解决方案是否是最好的,但它是有效的。
我在 JobConfig 类中添加了两个成员变量:
// Shared read window; flatFileItemReader() applies it to each new reader and then advances it.
// NOTE(review): starting at -1/1 looks like it gives the first reader a single record, which
// would leave the last CSV record unread once five readers have been created — confirm.
private int startLine = -1;
private int lastLine = 1;
然后我在 flatFileItemReader 方法中把这两个值设置到 reader 上:
// Apply the current window to this reader, then slide the window forward by two records
// for the next reader that gets created.
// NOTE(review): the fields are updated without synchronization; presumably readers are
// created from several worker threads — confirm.
reader.setCurrentItemCount(startLine);
reader.setMaxItemCount(lastLine);
startLine+=2;
lastLine+=2;
完整代码:
/**
 * Spring Batch configuration in which every step-scoped reader claims the next fixed-size window
 * of CSV records from a counter kept on this configuration object.
 *
 * <p>Known limitations of this approach, kept as-is: the window size is hard-coded, so only
 * {@code gridSize * LINES_PER_PARTITION} records are covered, and the counters are never reset,
 * so a second job run in the same application context continues after the previous windows.
 */
@Configuration
@RequiredArgsConstructor
public class JobConfig {

    /** Number of CSV records assigned to each reader. */
    private static final int LINES_PER_PARTITION = 2;

    private final JobBuilderFactory jobBuilderFactory;
    private final StepBuilderFactory stepBuilderFactory;

    // Window handed to the next reader, as the half-open item range [startLine, lastLine).
    // It starts at 0 so that the first reader also receives LINES_PER_PARTITION records and the
    // last record of the file is not left over.
    private int startLine = 0;
    private int lastLine = LINES_PER_PARTITION;

    /** CSV input; each line is {@code <id>;<firstName>}. */
    @Value("classpath:employees.csv")
    private Resource resourceFile;

    /** Builds the job "my job 1", which consists solely of the partitioned master step. */
    @Bean("MyJob1")
    public Job createJob() {
        return jobBuilderFactory.get("my job 1")
                .incrementer(new RunIdIncrementer())
                .start(step())
                .build();
    }

    /** Master step: delegates partition creation to SimplePartitioner and execution to the handler. */
    @Bean("MyStep1")
    public Step step() {
        return stepBuilderFactory.get("my step 1")
                .partitioner("my step 1", new SimplePartitioner())
                .partitionHandler(partitionHandler())
                .build();
    }

    /** Worker step: reads one employee per chunk and prints its id while processing. */
    @Bean("slaveStep")
    public Step slaveStep() {
        return stepBuilderFactory.get("read csv stream")
                .<Employee, Employee>chunk(1)
                .reader(flatFileItemReader())
                .processor((ItemProcessor<Employee, Employee>) employee -> {
                    System.out.printf("Processed item %s%n", employee.getId());
                    return employee;
                })
                .writer(list -> {
                    // Per-item output is switched off in this variant; the writer discards its input.
                })
                .build();
    }

    /**
     * Step-scoped reader limited to the next unclaimed window of records.
     *
     * @return a reader mapping column 0 to the id and column 1 to the first name
     */
    @StepScope
    @Bean
    public FlatFileItemReader<Employee> flatFileItemReader() {
        FlatFileItemReader<Employee> reader = new FlatFileItemReader<>();
        reader.setResource(resourceFile);
        DefaultLineMapper<Employee> lineMapper = new DefaultLineMapper<>();
        lineMapper.setFieldSetMapper(fieldSet -> {
            String[] values = fieldSet.getValues();
            return Employee.builder()
                    .id(Integer.parseInt(values[0]))
                    .firstName(values[1])
                    .build();
        });
        lineMapper.setLineTokenizer(new DelimitedLineTokenizer(";"));
        reader.setLineMapper(lineMapper);
        // Readers are created from the worker threads, so claiming a window and advancing the
        // counters must happen atomically; otherwise two readers could receive the same window.
        synchronized (this) {
            reader.setCurrentItemCount(startLine);
            reader.setMaxItemCount(lastLine);
            startLine += LINES_PER_PARTITION;
            lastLine += LINES_PER_PARTITION;
        }
        return reader;
    }

    /** Partition handler that runs the worker step on the task executor with a grid size of 5. */
    @Bean
    public PartitionHandler partitionHandler() {
        TaskExecutorPartitionHandler taskExecutorPartitionHandler = new TaskExecutorPartitionHandler();
        taskExecutorPartitionHandler.setTaskExecutor(taskExecutor());
        taskExecutorPartitionHandler.setStep(slaveStep());
        taskExecutorPartitionHandler.setGridSize(5);
        return taskExecutorPartitionHandler;
    }

    /**
     * Thread pool with core size, max size and queue capacity all set to 5.
     *
     * <p>{@code afterPropertiesSet()} is deliberately not called here: the container invokes it on
     * the returned bean, and calling it a second time would build a second, discarded pool.
     */
    @Bean
    public TaskExecutor taskExecutor() {
        ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
        taskExecutor.setMaxPoolSize(5);
        taskExecutor.setCorePoolSize(5);
        taskExecutor.setQueueCapacity(5);
        return taskExecutor;
    }
}
我认为,我最初想对 CSV 文件的读取进行分区,这个思路从原则上讲就是错误的。 只有当要读取的资源本身容易被分区时,分区才有效。
但是,把文件分区、让每个线程只读取特定范围的记录,并不一定能带来性能提升,参见:《在 Java 中将一个大文件拆分为多个 InputStream 以便多线程处理》。
让批处理使用多个线程的一种改进方式是添加 SimpleAsyncTaskExecutor。 这样每个数据块(chunk)都会在单独的线程中被读取、处理和写入:
/**
 * Multi-threaded chunk step: each chunk is read, processed and written on a thread supplied by
 * a {@code SimpleAsyncTaskExecutor} whose thread names start with "spring_batch".
 *
 * <p>The flat-file reader is not safe for concurrent use, so it is wrapped in a
 * {@code SynchronizedItemStreamReader}, which serializes the {@code read()} calls while
 * processing and writing still run in parallel.
 *
 * @return the step registered as "MyStep1"
 */
@Bean("MyStep1")
public Step step() {
    // Fully qualified because this snippet cannot add an import of its own.
    org.springframework.batch.item.support.SynchronizedItemStreamReader<Employee> synchronizedReader =
            new org.springframework.batch.item.support.SynchronizedItemStreamReader<>();
    synchronizedReader.setDelegate(flatFileItemReader());

    return stepBuilderFactory.get("my step 1")
            .<Employee, Employee>chunk(1)
            .reader(synchronizedReader)
            .processor((ItemProcessor<Employee, Employee>) employee -> {
                System.out.printf("Processed item %s%n", employee.getId());
                return employee;
            })
            .writer(list -> {
                for (Employee item : list) {
                    System.out.println(item);
                }
            })
            .taskExecutor(new SimpleAsyncTaskExecutor("spring_batch"))
            .build();
}
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.