一、概述
ORC 的全称是 Optimized Row Columnar(优化的行列式存储),是一种列式存储文件格式,它在 RCFile 的基础上进行了一定的改进。
二、maven引入
<!-- Hadoop client libraries. Presumably the source of the Configuration and Path
     classes used by the write/read examples in this document (confirm against the imports). -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.7</version>
</dependency>
<!-- Apache ORC core library. Presumably the source of OrcFile, TypeDescription, Writer,
     Reader and RecordReader used by the write/read examples in this document
     (confirm against the imports). -->
<dependency>
<groupId>org.apache.orc</groupId>
<artifactId>orc-core</artifactId>
<version>1.5.4</version>
</dependency>
三、写入 orc
/**
 * Writes ten sample rows to {@code my-file.orc} using the vectorized ORC writer API.
 *
 * <p>The file has four columns: {@code long_value}, {@code double_value},
 * {@code boolean_value} and {@code string_value}. Row {@code r} stores {@code r} in the
 * long and double columns, {@code true} for the first five rows in the boolean column,
 * and a random UUID in the string column.
 *
 * @throws IOException if the file cannot be created or written
 */
@Test
public void write() throws IOException {
    Configuration conf = new Configuration();
    TypeDescription schema = TypeDescription.createStruct()
            .addField("long_value", TypeDescription.createLong())
            .addField("double_value", TypeDescription.createDouble())
            .addField("boolean_value", TypeDescription.createBoolean())
            .addField("string_value", TypeDescription.createString());

    Writer writer = OrcFile.createWriter(new Path("my-file.orc"),
            OrcFile.writerOptions(conf).setSchema(schema));
    try {
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector longVector = (LongColumnVector) batch.cols[0];
        DoubleColumnVector doubleVector = (DoubleColumnVector) batch.cols[1];
        // Boolean columns are carried in a LongColumnVector as 1 (true) / 0 (false).
        LongColumnVector booleanVector = (LongColumnVector) batch.cols[2];
        BytesColumnVector stringVector = (BytesColumnVector) batch.cols[3];

        for (int r = 0; r < 10; ++r) {
            int row = batch.size++;
            longVector.vector[row] = r;
            doubleVector.vector[row] = r;
            booleanVector.vector[row] = r < 5 ? 1 : 0;
            // Encode explicitly as UTF-8 instead of relying on the platform default charset.
            stringVector.setVal(row,
                    UUID.randomUUID().toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));

            // Flush as soon as the batch is full so the example stays correct
            // if the row count is raised beyond the batch capacity.
            if (batch.size == batch.getMaxSize()) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        }

        // Write whatever is left in the last, partially filled batch.
        if (batch.size != 0) {
            writer.addRowBatch(batch);
            batch.reset();
        }
    } finally {
        // Always release the writer, even when filling or adding a batch fails.
        writer.close();
    }
}
四、读取 orc
/**
 * Reads {@code my-file.orc} with the vectorized ORC reader API and prints every row
 * as {@code long, double, boolean, string} on standard output.
 *
 * <p>A search argument {@code long_value BETWEEN 0 AND 6} is passed to the reader.
 * NOTE(review): per the ORC documentation a search argument is only used to skip whole
 * stripes / row groups and does not filter individual rows, so rows outside the range
 * can still be printed; add an explicit per-row check if exact filtering is needed.
 *
 * @throws IOException if the file cannot be opened or read
 */
@Test
public void read() throws IOException {
    Configuration conf = new Configuration();
    TypeDescription schema = TypeDescription.createStruct()
            .addField("long_value", TypeDescription.createLong())
            .addField("double_value", TypeDescription.createDouble())
            .addField("boolean_value", TypeDescription.createBoolean())
            .addField("string_value", TypeDescription.createString());

    Reader reader = OrcFile.createReader(new Path("my-file.orc"),
            OrcFile.readerOptions(conf));

    Reader.Options readerOptions = new Reader.Options(conf)
            .searchArgument(
                    SearchArgumentFactory
                            .newBuilder()
                            .between("long_value", PredicateLeaf.Type.LONG, 0L, 6L)
                            .build(),
                    new String[]{"long_value"}
            );

    RecordReader rows = reader.rows(readerOptions.schema(schema));
    try {
        VectorizedRowBatch batch = schema.createRowBatch();
        // The reader refills these same vector objects on every nextBatch call,
        // so the casts can be done once, outside the loop.
        LongColumnVector longVector = (LongColumnVector) batch.cols[0];
        DoubleColumnVector doubleVector = (DoubleColumnVector) batch.cols[1];
        LongColumnVector booleanVector = (LongColumnVector) batch.cols[2];
        BytesColumnVector stringVector = (BytesColumnVector) batch.cols[3];

        while (rows.nextBatch(batch)) {
            for (int r = 0; r < batch.size; r++) {
                // When a vector is flagged isRepeating, only element 0 is meaningful.
                long longValue = longVector.vector[longVector.isRepeating ? 0 : r];
                double doubleValue = doubleVector.vector[doubleVector.isRepeating ? 0 : r];
                boolean boolValue = booleanVector.vector[booleanVector.isRepeating ? 0 : r] != 0;
                String stringValue = stringVector.toString(r);
                System.out.println(longValue + ", " + doubleValue + ", " + boolValue + ", " + stringValue);
            }
        }
    } finally {
        // Always release the record reader, even when reading or printing fails.
        rows.close();
    }
}