diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java
index c9842c9375..19b1d5426e 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java
@@ -294,6 +294,14 @@ private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map) {
     return Collections.unmodifiableMap(setMultiMap);
   }
 
+  /**
+   * Returns the 0-based index of the row group currently being read. Returns -1 if no row group
+   * has been read yet.
+   */
+  public int getCurrentRowGroupIndex() {
+    return currentBlock;
+  }
+
   /**
    * Returns the row index of the current row. If no row has been processed or if the
    * row index information is unavailable from the underlying @{@link PageReadStore}, returns -1.
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
index 551b1bf6c7..e0b0d76e0e 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
@@ -1097,6 +1097,14 @@ public List<BlockMetaData> getRowGroups() {
     return blocks;
   }
 
+  /**
+   * Returns the 0-based index of the row group that was last read via {@link #readNextRowGroup()}
+   * or {@link #readNextFilteredRowGroup()}. Returns -1 if no row group has been read yet.
+   */
+  public int getCurrentRowGroupIndex() {
+    return currentBlock - 1;
+  }
+
   public void setRequestedSchema(List<ColumnDescriptor> columns) {
     paths.clear();
     for (ColumnDescriptor col : columns) {
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java
index 4514a829c5..01ac69b330 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java
@@ -144,6 +144,17 @@ public T read() throws IOException {
     }
   }
 
+  /**
+   * @return the 0-based index of the row group currently being read. If no row group has been
+   *     read yet, returns -1.
+   */
+  public int getCurrentRowGroupIndex() {
+    if (reader == null) {
+      return -1;
+    }
+    return reader.getCurrentRowGroupIndex();
+  }
+
   /**
    * @return the row index of the last read row. If no row has been processed, returns -1.
    */
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetRecordReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetRecordReader.java
index b217116aac..c0e52fc5c6 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetRecordReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetRecordReader.java
@@ -207,6 +207,14 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
     return internalReader.nextKeyValue();
   }
 
+  /**
+   * @return the 0-based index of the row group currently being read. If no row group has been
+   *     read yet, returns -1.
+   */
+  public int getCurrentRowGroupIndex() {
+    return internalReader.getCurrentRowGroupIndex();
+  }
+
   /**
    * @return the row index of the current row. If no row has been processed, returns -1.
    */
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetReader.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetReader.java
index 4a4157e7af..b1a9bf1e31 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetReader.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetReader.java
@@ -22,6 +22,7 @@
 import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;
 import static org.apache.parquet.hadoop.ParquetFileWriter.Mode.OVERWRITE;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
@@ -201,6 +202,39 @@ public void testCurrentRowIndex() throws Exception {
     assertEquals(reader.getCurrentRowIndex(), -1);
   }
 
+  @Test
+  public void testCurrentRowGroupIndex() throws Exception {
+    int expectedRowGroups;
+    try (ParquetFileReader fileReader =
+        ParquetFileReader.open(HadoopInputFile.fromPath(file, new Configuration()))) {
+      expectedRowGroups = fileReader.getRowGroups().size();
+    }
+    assertTrue("expected multiple row groups for this test", expectedRowGroups > 1);
+
+    try (ParquetReader<Group> reader =
+        PhoneBookWriter.createReader(file, FilterCompat.NOOP, allocator)) {
+      // before reading anything, returns -1
+      assertEquals(-1, reader.getCurrentRowGroupIndex());
+
+      reader.read();
+      assertEquals(0, reader.getCurrentRowGroupIndex());
+      // idempotent
+      assertEquals(0, reader.getCurrentRowGroupIndex());
+
+      int prevIdx = 0;
+      while (reader.read() != null) {
+        int idx = reader.getCurrentRowGroupIndex();
+        assertTrue(idx >= prevIdx);
+        assertTrue(idx <= prevIdx + 1);
+        prevIdx = idx;
+      }
+      // last row group seen should be the final one
+      assertEquals(expectedRowGroups - 1, prevIdx);
+      // after exhaustion, returns -1
+      assertEquals(-1, reader.getCurrentRowGroupIndex());
+    }
+  }
+
   @Test
   public void testRangeFiltering() throws Exception {
     // The readUsers also validates the rowIndex for each returned row.