Write Database View To Parquet File
Updated: Jun 1, 2023
In this example, you are going to learn how to generate a Parquet schema from a database view.
Check Parquet examples for more similar examples.
Input SQL file
user_information_view.sql
-- View exposing users whose last name is 'Doe'.
CREATE VIEW users_with_lastname_as_doe AS
    SELECT user_id, user_name, email, password, user_role_id
    FROM user
    WHERE last_name = 'Doe';
user_information.sql
-- Roles that a user can hold.
CREATE TABLE user_role (
    user_role_id INT NOT NULL,
    user_role VARCHAR(32) NOT NULL,
    PRIMARY KEY (user_role_id)
);

-- Application users; each user may reference a role.
CREATE TABLE user (
    user_id INT NOT NULL,
    user_name VARCHAR(16) NOT NULL,
    first_name VARCHAR(255) NULL,
    last_name VARCHAR(255) NULL,
    email VARCHAR(255) NULL,
    password VARCHAR(255) NOT NULL,
    user_role_id INT NULL,
    PRIMARY KEY (user_id),
    FOREIGN KEY (user_role_id) REFERENCES user_role (user_role_id)
);

-- Seed data.
INSERT INTO user_role (user_role_id, user_role) VALUES
    (1, 'Admin'),
    (2, 'Viewer');

INSERT INTO user (user_id, user_name, first_name, last_name, email, password, user_role_id) VALUES
    (1, 'john_doe', 'John', 'Doe', 'john.doe@example.com', '**********', 1),
    (2, 'jane_doe', 'jane', 'Doe', 'jane.doe@example.com', '**********', 1),
    (3, 'view_user', 'viewer', '', 'view_user@example.com', '**********', 2);
Java Code Listing
package com.northconcepts.datapipeline.examples.parquet;
import com.northconcepts.datapipeline.core.DataReader;
import com.northconcepts.datapipeline.core.StreamWriter;
import com.northconcepts.datapipeline.examples.database.DB;
import com.northconcepts.datapipeline.jdbc.JdbcReader;
import com.northconcepts.datapipeline.job.Job;
import com.northconcepts.datapipeline.parquet.ParquetDataReader;
import com.northconcepts.datapipeline.parquet.ParquetDataWriter;

import java.io.File;
import java.sql.Connection;
import java.sql.SQLException;
/**
 * Demonstrates generating a Parquet schema from a database view and writing the
 * view's records to a Parquet file, then reading the file back to verify the output.
 */
public class WriteDatabaseViewToParquetFile {

    private static final String PARQUET_FILE = "example/data/output/WriteAParquetFileUsingSchemaFromDatabaseView.parquet";

    public static void main(String[] args) throws SQLException {
        DB db = new DB(); // creates HSQL DB
        Connection connection = db.getConnection();
        try {
            // Create the tables and the view, and insert the sample records.
            db.executeFile(new File("example/data/input/user_information.sql"));
            db.executeFile(new File("example/data/input/user_information_view.sql"));

            // Keep the connection open after the read: setSchema() below reuses it,
            // and we close it explicitly in the finally block.
            DataReader reader = new JdbcReader(connection, "SELECT * FROM USERS_WITH_LASTNAME_AS_DOE")
                    .setAutoCloseConnection(false);

            ParquetDataWriter writer = new ParquetDataWriter(new File(PARQUET_FILE));
            // Derive the Parquet schema from the view's metadata
            // (catalog = null, schema = "PUBLIC", table = the view name).
            writer.setSchema(connection, null, "PUBLIC", "USERS_WITH_LASTNAME_AS_DOE");

            System.out.println("===================Generated Parquet Schema========================");
            System.out.println(writer.getSchema());
            System.out.println("===================================================================");

            // Transfer all records from the database view to the Parquet file.
            Job.run(reader, writer);

            System.out.println("=======================Reading Parquet File============================================");
            // Read the Parquet file back and print its records to confirm the write.
            Job.run(new ParquetDataReader(new File(PARQUET_FILE)), new StreamWriter(System.out));
        } finally {
            // setAutoCloseConnection(false) means the reader will not close the
            // connection for us; close it here to avoid leaking it.
            connection.close();
        }
    }
}
Code walkthrough
- Create a `DB` instance, which creates an in-memory HSQL database.
- Execute the database scripts to create the tables and the view and to insert the sample records.
- Create a `JdbcReader` to read the records returned by the specified query.
- Create an instance of `ParquetDataWriter` to write records to the Parquet file.
- Pass the connection, catalog name, schema name, and table name (in this case the view "users_with_lastname_as_doe" created by the database script) to `writer.setSchema()`.
- Obtain the generated Parquet schema with `writer.getSchema()`.
- `Job.run(reader, writer)` transfers the data from the reader to the writer, writing it to the Parquet file.
Console Output
===================Generated Parquet Schema========================
message schema {
optional int32 USER_ID (INTEGER(32,true));
optional binary USER_NAME (STRING);
optional binary EMAIL (STRING);
optional binary PASSWORD (STRING);
optional int32 USER_ROLE_ID (INTEGER(32,true));
}
===================================================================
15:06:38,608 DEBUG [main] datapipeline:615 - Job[1,job-1,Mon May 15 15:06:38 EAT 2023]::Start
15:06:43,781 DEBUG [main] datapipeline:661 - job::Success
=======================Reading Parquet File============================================
15:06:43,788 DEBUG [main] datapipeline:615 - Job[2,job-2,Mon May 15 15:06:43 EAT 2023]::Start
-----------------------------------------------
0 - Record (MODIFIED) {
0:[USER_ID]:INT=[1]:Integer
1:[USER_NAME]:STRING=[john_doe]:String
2:[EMAIL]:STRING=[john.doe@example.com]:String
3:[PASSWORD]:STRING=[**********]:String
4:[USER_ROLE_ID]:INT=[1]:Integer
}
-----------------------------------------------
1 - Record (MODIFIED) {
0:[USER_ID]:INT=[2]:Integer
1:[USER_NAME]:STRING=[jane_doe]:String
2:[EMAIL]:STRING=[jane.doe@example.com]:String
3:[PASSWORD]:STRING=[**********]:String
4:[USER_ROLE_ID]:INT=[1]:Integer
}
-----------------------------------------------
2 records
