001package com.astrolabsoftware.FinkBrowser.Parquet; 002 003import org.apache.hadoop.conf.Configuration; 004import org.apache.hadoop.fs.Path; 005import org.apache.parquet.hadoop.ParquetFileReader; 006import org.apache.parquet.hadoop.metadata.ParquetMetadata; 007import org.apache.parquet.format.converter.ParquetMetadataConverter; 008import org.apache.parquet.schema.MessageType; 009import org.apache.parquet.column.page.PageReadStore; 010import org.apache.parquet.io.MessageColumnIO; 011import org.apache.parquet.io.ColumnIOFactory; 012import org.apache.parquet.io.RecordReader; 013import org.apache.parquet.example.data.simple.convert.GroupRecordConverter; 014import org.apache.parquet.example.data.Group; 015import org.apache.parquet.example.data.simple.SimpleGroup; 016import org.apache.parquet.schema.GroupType; 017 018public class Test { 019 020 public static void main(String[] args) { 021 readParquetFile(); 022 } 023 024 private static void readParquetFile() { 025 Configuration conf = new Configuration(); 026 Path path = new Path("/user/hrivnac/2021_05_01_part-00001-25a8dbcc-1a3c-428b-9eeb-087566a78bbd.c000.snappy.parquet"); 027 try { 028 ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER); 029 MessageType schema = readFooter.getFileMetaData().getSchema(); 030 ParquetFileReader r = new ParquetFileReader(conf, path, readFooter); 031 PageReadStore pages = null; 032 while (null != (pages = r.readNextRowGroup())) { 033 final long rows = pages.getRowCount(); 034 System.out.println("Number of rows: " + rows); 035 final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema); 036 final RecordReader<Group> recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema)); 037 String sTemp = ""; 038 Group g; 039 SimpleGroup sg; 040 GroupType type; 041 int n; 042 while ((g = recordReader.read()) != null) { 043 if (g instanceof SimpleGroup) { 044 sg = (SimpleGroup)g; 045 type = sg.getType(); 046 n = type.getFieldCount(); 047 for (int i = 0; i < n; i++) { 048 System.out.println(type.getFieldName(i) + " " + g.getFieldRepetitionCount(i) + " " + type.getType(i)); 049 //System.out.println(g.getString(i, 0)); 050 } 051 } 052 else { 053 System.out.println(g.getClass()); 054 } 055 } 056 } 057 } 058 catch (Exception e) { 059 e.printStackTrace(); 060 } 061 } 062 063 }