[英]Creating BigQuery table column with BIGNUMERIC data type using Java
[英]BigQuery Read Storage API with ARROW format appending 0s for NUMERIC and BIGNUMERIC data
我嘗試了 Google 文檔中的代碼以實現 Read Storage API 實現。 但是 Numeric 和 BigNumeric 列返回時附加了 0。
例如:我的表有數字數據 123,下面的代碼返回如下: Schema<numeric_datatype: Decimal(38, 9, 128)> numeric_datatype 123000000000.000000000
使用的代碼: https://cloud.google.com/bigquery/docs/reference/storage/libraries
請幫助理解和解決問題。
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.ipc.ReadChannel;
import org.apache.arrow.vector.ipc.message.MessageSerializer;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
import com.google.cloud.bigquery.storage.v1.ArrowRecordBatch;
import com.google.cloud.bigquery.storage.v1.ArrowSchema;
import com.google.cloud.bigquery.storage.v1.BigQueryReadClient;
import com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest;
import com.google.cloud.bigquery.storage.v1.DataFormat;
import com.google.cloud.bigquery.storage.v1.ReadRowsRequest;
import com.google.cloud.bigquery.storage.v1.ReadRowsResponse;
import com.google.cloud.bigquery.storage.v1.ReadSession;
import com.google.cloud.bigquery.storage.v1.ReadSession.TableModifiers;
import com.google.cloud.bigquery.storage.v1.ReadSession.TableReadOptions;
import com.google.protobuf.Timestamp;
import com.google.api.gax.rpc.ServerStream;
public class StorageArrowExample {

  /**
   * Reads Arrow record batches delivered by the BigQuery Storage Read API and prints
   * their contents as TSV. Owns an Arrow allocator and a {@link VectorSchemaRoot},
   * so it must be closed to release the off-heap buffers they hold.
   */
  private static class SimpleRowReader implements AutoCloseable {

    // Root allocator backing every Arrow vector created by this reader.
    private final BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
    private final VectorSchemaRoot root;
    private final VectorLoader loader;

    /**
     * Deserializes the Arrow schema sent with the read session and prepares one
     * empty vector per schema field.
     *
     * @param arrowSchema serialized Arrow schema taken from the {@code ReadSession}
     * @throws IOException if the schema bytes cannot be deserialized
     */
    public SimpleRowReader(ArrowSchema arrowSchema) throws IOException {
      Schema schema = MessageSerializer.deserializeSchema(new ReadChannel(
          new ByteArrayReadableSeekableByteChannel(arrowSchema.getSerializedSchema().toByteArray())));
      System.out.println(schema);
      Preconditions.checkNotNull(schema);
      List<FieldVector> vectors = new ArrayList<>();
      for (Field field : schema.getFields()) {
        vectors.add(field.createVector(allocator));
      }
      root = new VectorSchemaRoot(vectors);
      root.syncSchema();
      loader = new VectorLoader(root);
    }

    /**
     * Deserializes one serialized record batch, loads it into the vector root,
     * and prints all rows as TSV.
     *
     * <p>NOTE: NUMERIC/BIGNUMERIC columns print with their full fixed scale
     * (9 and 38 fractional digits respectively) because Decimal(256)Vector values
     * are BigDecimals carrying that scale; call {@code stripTrailingZeros()} on
     * each value before printing if the trailing zeros are unwanted.
     *
     * @param batch serialized Arrow record batch from a {@code ReadRowsResponse}
     * @throws IOException if the batch bytes cannot be deserialized
     */
    public void processRows(ArrowRecordBatch batch) throws IOException {
      org.apache.arrow.vector.ipc.message.ArrowRecordBatch deserializedBatch = MessageSerializer
          .deserializeRecordBatch(new ReadChannel(
              new ByteArrayReadableSeekableByteChannel(batch.getSerializedRecordBatch().toByteArray())),
              allocator);
      System.out.println(deserializedBatch);
      loader.load(deserializedBatch);
      // Release buffers from batch (they are still held in the vectors in root).
      deserializedBatch.close();
      System.out.println(root.contentToTSVString());
      root.clear();
    }

    /** Releases the vector root and all off-heap memory held by the allocator. */
    @Override
    public void close() throws Exception {
      // Fix: the original left this method empty, leaking Arrow's off-heap buffers.
      root.close();
      allocator.close();
    }
  }

  /**
   * Creates a single-stream ARROW read session over the configured table,
   * selecting only the NUMERIC and BIGNUMERIC columns, then streams and prints
   * every record batch.
   *
   * @param args optional; args[1] (when present) is a snapshot time in epoch millis
   */
  public static void main(String... args) throws Exception {
    String projectId = "****";
    String table = "****";
    String dataset = "****";
    Integer snapshotMillis = null;
    if (args.length > 1) {
      snapshotMillis = Integer.parseInt(args[1]);
    }
    try (BigQueryReadClient client = BigQueryReadClient.create()) {
      String parent = String.format("projects/%s", projectId);
      // Fix: the format string expects the dataset before the table; the original
      // passed (table, dataset) and therefore built an invalid table path.
      String srcTable = String.format("projects/%s/datasets/%s/tables/%s", projectId, dataset, table);
      TableReadOptions options = TableReadOptions.newBuilder()
          .addSelectedFields("numeric_datatype")
          .addSelectedFields("bignumeric_datatype")
          .clearArrowSerializationOptions()
          .build();
      ReadSession.Builder sessionBuilder = ReadSession.newBuilder()
          .setTable(srcTable)
          .setDataFormat(DataFormat.ARROW)
          .setReadOptions(options);
      // Optionally specify the snapshot time. When unspecified, snapshot time is "now".
      if (snapshotMillis != null) {
        Timestamp t = Timestamp.newBuilder()
            .setSeconds(snapshotMillis / 1000)
            .setNanos((int) ((snapshotMillis % 1000) * 1000000))
            .build();
        TableModifiers modifiers = TableModifiers.newBuilder().setSnapshotTime(t).build();
        sessionBuilder.setTableModifiers(modifiers);
      }
      // Begin building the session creation request.
      CreateReadSessionRequest.Builder builder = CreateReadSessionRequest.newBuilder()
          .setParent(parent)
          .setReadSession(sessionBuilder)
          .setMaxStreamCount(1);
      ReadSession session = client.createReadSession(builder.build());
      // Setup a simple reader and start a read session.
      try (SimpleRowReader reader = new SimpleRowReader(session.getArrowSchema())) {
        Preconditions.checkState(session.getStreamsCount() > 0);
        String streamName = session.getStreams(0).getName();
        ReadRowsRequest readRowsRequest = ReadRowsRequest.newBuilder().setReadStream(streamName).build();
        ServerStream<ReadRowsResponse> stream = client.readRowsCallable().call(readRowsRequest);
        for (ReadRowsResponse response : stream) {
          Preconditions.checkState(response.hasArrowRecordBatch());
          reader.processRows(response.getArrowRecordBatch());
        }
      }
    }
  }
}
使用您的代碼我無法重現該問題。
創建測試表:
CREATE or REPLACE table abc.num
AS SELECT CAST(123 as numeric) as numeric_datatype,
CAST(123 as bignumeric) as bignumeric_datatype
然后運行您的代碼(注意:您粘貼的代碼在生成表 URL 時把數據集和表兩個參數的順序交換了,運行前需先修正)我得到:
ArrowRecordBatch [length=1, nodes=[ArrowFieldNode [length=1, nullCount=0], ArrowFieldNode [length=1, nullCount=0]], #buffers=4, buffersLayout=[ArrowBuffer [offset=0, size=0], ArrowBuffer [offset=0, size=16], ArrowBuffer [offset=16, size=0], ArrowBuffer [offset=16, size=32]], closed=false]
numeric_datatype bignumeric_datatype
123.000000000 123.00000000000000000000000000000000000000
小數點后的額外數字是預期的,因為 numeric 的小數位數(scale)是 9,而 bignumeric 的小數位數是 38,這意味着完整值在邏輯上包含這些尾隨零。contentToTSVString 只是對從 DecimalVector/Decimal256Vector 返回的 BigDecimal 調用 toString。如果要去掉尾隨的零,可以先調用 DecimalVector 的 getObject 取得 BigDecimal,再在打印之前對其調用 stripTrailingZeros。
相關依賴:
<dependency>
<groupId>com.google.api.grpc</groupId>
<artifactId>grpc-google-cloud-bigquerystorage-v1</artifactId>
<version>2.12.0</version>
</dependency>
<dependency>
<groupId>com.google.api.grpc</groupId>
<artifactId>proto-google-cloud-bigquerystorage-v1</artifactId>
<version>2.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-memory-netty</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-bigquerystorage</artifactId>
<version>2.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
<version>6.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.google.api/gax -->
<dependency>
<groupId>com.google.api</groupId>
<artifactId>gax</artifactId>
<version>2.12.2</version>
</dependency>
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.