data · 2022-07-17 0

java 读写 orc

一、概述

ORC 的全称是 Optimized Row Columnar(优化的行列式存储),是一种列式存储文件格式,它在 RCFile 的基础上进行了一定的改进。

二、maven引入

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.7</version>
</dependency>

<dependency>
    <groupId>org.apache.orc</groupId>
    <artifactId>orc-core</artifactId>
    <version>1.5.4</version>
</dependency>

三、写入 orc

    /**
     * Writes ten sample rows to {@code my-file.orc} using the vectorized ORC writer API.
     *
     * <p>The schema is a struct of four columns: a long, a double, a boolean and a string.
     * Rows are staged in a {@link VectorizedRowBatch} and handed to the writer whenever the
     * batch fills up, plus once more at the end for the remaining partial batch.
     *
     * @throws IOException if the file cannot be created or written
     */
    @Test
    public void write() throws IOException {
        Configuration conf = new Configuration();
        TypeDescription schema = TypeDescription.createStruct()
                .addField("long_value", TypeDescription.createLong())
                .addField("double_value", TypeDescription.createDouble())
                .addField("boolean_value", TypeDescription.createBoolean())
                .addField("string_value", TypeDescription.createString());

        Writer writer = OrcFile.createWriter(new Path("my-file.orc"),
                OrcFile.writerOptions(conf).setSchema(schema));

        // try/finally guarantees the writer is closed even if filling or
        // flushing a batch throws.
        try {
            VectorizedRowBatch batch = schema.createRowBatch();
            LongColumnVector longVector = (LongColumnVector) batch.cols[0];
            DoubleColumnVector doubleVector = (DoubleColumnVector) batch.cols[1];
            // Boolean columns are cast to LongColumnVector here and stored as 0 / 1.
            LongColumnVector booleanVector = (LongColumnVector) batch.cols[2];
            BytesColumnVector stringVector = (BytesColumnVector) batch.cols[3];

            for (int r = 0; r < 10; ++r) {
                int row = batch.size++;

                longVector.vector[row] = r;
                doubleVector.vector[row] = r;
                booleanVector.vector[row] = r < 5 ? 1 : 0;
                // Encode explicitly as UTF-8 instead of relying on the platform default charset.
                stringVector.setVal(row,
                        UUID.randomUUID().toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));

                // Flush as soon as the batch is full so the loop stays correct
                // if the row count is raised above the batch capacity.
                if (batch.size == batch.getMaxSize()) {
                    writer.addRowBatch(batch);
                    batch.reset();
                }
            }

            // Write whatever is left in the final, partially filled batch.
            if (batch.size != 0) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        } finally {
            writer.close();
        }
    }

四、读取 orc

    /**
     * Reads {@code my-file.orc} with the vectorized ORC reader API and prints every row
     * returned by the reader as {@code long, double, boolean, string}.
     *
     * <p>A search argument ({@code long_value BETWEEN 0 AND 6}) is pushed down to the reader.
     *
     * @throws IOException if the file cannot be opened or read
     */
    @Test
    public void read() throws IOException {
        Configuration conf = new Configuration();
        TypeDescription schema = TypeDescription.createStruct()
                .addField("long_value", TypeDescription.createLong())
                .addField("double_value", TypeDescription.createDouble())
                .addField("boolean_value", TypeDescription.createBoolean())
                .addField("string_value", TypeDescription.createString());

        Reader reader = OrcFile.createReader(new Path("my-file.orc"),
                OrcFile.readerOptions(conf));

        // NOTE(review): per the ORC documentation a search argument is used to skip
        // stripes / row groups, not to filter individual rows, so rows outside 0..6 may
        // still be printed below. Add an explicit per-row check if exact filtering is needed.
        Reader.Options readerOptions = new Reader.Options(conf)
                .searchArgument(
                        SearchArgumentFactory
                                .newBuilder()
                                .between("long_value", PredicateLeaf.Type.LONG, 0L, 6L)
                                .build(),
                        new String[]{"long_value"}
                );

        RecordReader rows = reader.rows(readerOptions.schema(schema));

        // try/finally guarantees the record reader is closed even if decoding a batch throws.
        try {
            VectorizedRowBatch batch = schema.createRowBatch();

            while (rows.nextBatch(batch)) {
                LongColumnVector longVector = (LongColumnVector) batch.cols[0];
                DoubleColumnVector doubleVector = (DoubleColumnVector) batch.cols[1];
                // Boolean columns are cast to LongColumnVector here and read back as 0 / non-zero.
                LongColumnVector booleanVector = (LongColumnVector) batch.cols[2];
                BytesColumnVector stringVector = (BytesColumnVector) batch.cols[3];

                for (int r = 0; r < batch.size; r++) {
                    // When a vector is flagged isRepeating, only element 0 is populated
                    // and it applies to every row of the batch.
                    long longValue = longVector.vector[longVector.isRepeating ? 0 : r];
                    double doubleValue = doubleVector.vector[doubleVector.isRepeating ? 0 : r];
                    boolean boolValue = booleanVector.vector[booleanVector.isRepeating ? 0 : r] != 0;
                    String stringValue = stringVector.toString(r);

                    System.out.println(longValue + ", " + doubleValue + ", " + boolValue + ", " + stringValue);
                }
            }
        } finally {
            rows.close();
        }
    }