Read Selected Fields from a Parquet File

Updated: Feb 21, 2022
/*
 * Copyright (c) 2006-2022 North Concepts Inc.  All rights reserved.
 * Proprietary and Confidential.  Use is subject to license terms.
 * 
 * https://northconcepts.com/data-pipeline/licensing/
 */
package com.northconcepts.datapipeline.examples.parquet;

import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;

import java.io.File;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;

import com.northconcepts.datapipeline.core.StreamWriter;
import com.northconcepts.datapipeline.job.Job;
import com.northconcepts.datapipeline.parquet.ParquetDataReader;

/**
 * Example: read only a chosen subset of columns from a Parquet file.
 *
 * <p>A Parquet {@link MessageType} naming the wanted fields is handed to the
 * {@link ParquetDataReader}; the records are then written to standard output,
 * followed by the schema reported by the reader.
 */
public class ReadSelectedFieldsFromAParquetFile {

    /** Location of the sample Parquet input file. */
    private static final String INPUT_PATH = "example/data/input/read_parquet_file.parquet";

    public static void main(String[] args) {
        // Prepare the schema listing the fields to read from the file; other fields are ignored
        MessageType selectedFields = selectedFieldsSchema();

        File inputFile = new File(INPUT_PATH);

        // Remove the setSchema(...) call to see all fields (and in original arrangement)
        ParquetDataReader parquetReader = new ParquetDataReader(inputFile).setSchema(selectedFields);

        StreamWriter consoleWriter = new StreamWriter(System.out);
        Job.run(parquetReader, consoleWriter);

        System.out.println(parquetReader.getSchema());
    }

    /**
     * Builds the schema naming the fields this example reads.
     *
     * <p>Fields in the sample file that are deliberately left out:
     * <pre>
     *     optional int32 tinyint_col;
     *     optional int32 smallint_col;
     *     optional int64 bigint_col;
     *     optional float float_col;
     *     optional binary string_col;
     *     optional int96 timestamp_col;
     * </pre>
     *
     * @return a message type named {@code input_schema} with the five selected optional fields
     */
    private static MessageType selectedFieldsSchema() {
        return new MessageType(
                "input_schema",
                Types.optional(INT32).named("id"),
                Types.optional(INT32).named("int_col"),
                Types.optional(DOUBLE).named("double_col"),
                Types.optional(BINARY).as(stringType()).named("date_string_col"),
                Types.optional(BOOLEAN).named("bool_col"));
    }
}

Mobile Analytics