GH-43745: Add lz4-hadoop compressed parquet file with multiple blocks
This adds data to reproduce https://github.com/apache/arrow/issues/43745.
The Parquet file was generated with the following Java program:
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.GroupFactory;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

import java.io.IOException;
import java.util.UUID;

public class ParquetGenerator {
    public static void main(String... args) {
        MessageType schema = MessageTypeParser.parseMessageType(
            "message example {\n" +
            " required binary a (STRING);\n" +
            "}"
        );
        GroupFactory factory = new SimpleGroupFactory(schema);
        ParquetWriter<Group> writer;
        try {
            // Use the Hadoop LZ4 codec with a 64 KiB buffer so that the
            // column data is split across multiple compressed blocks.
            writer = ExampleParquetWriter.builder(new Path("parquet-testing/data/hadoop_lz4_compressed_block_split.parquet"))
                .withWriteMode(ParquetFileWriter.Mode.CREATE)
                .withCompressionCodec(CompressionCodecName.LZ4)
                .config("io.compression.codec.lz4.buffersize", "65536")
                .withType(schema)
                .build();
            // Write 65536 random UUID strings: enough data to overflow
            // the 64 KiB LZ4 buffer and force multiple compressed blocks.
            for (int i = 0; i < 65536; i++) {
                Group group = factory.newGroup();
                group.add("a", UUID.randomUUID().toString());
                writer.write(group);
            }
            writer.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
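
For completeness, here is a minimal sketch of how the generated file can be read back with parquet-mr's example API to confirm that all 65536 records decode; the class name ParquetVerifier is illustrative and not part of the original change.

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ParquetVerifier {
    public static void main(String... args) throws Exception {
        Path file = new Path("parquet-testing/data/hadoop_lz4_compressed_block_split.parquet");
        // Read every record back; a reader that mishandles multi-block
        // LZ4 data will fail here or return a short count.
        try (ParquetReader<Group> reader =
                 ParquetReader.builder(new GroupReadSupport(), file).build()) {
            long count = 0;
            while (reader.read() != null) {
                count++;
            }
            System.out.println("Read " + count + " records"); // expected: 65536
        }
    }
}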