orc
orc copied to clipboard
How to set list's offsets correctly
Greetings, I'm learning to work with ORC in C++, and I'm stuck: I don't quite understand how to set an array's offsets. Specifically, the following code, when executed, produces the following exception: "Caught exception in test-file.orc: bad read in nextBuffer":
void write_orc()
{
using namespace orc;
ORC_UNIQUE_PTR<OutputStream> outStream = writeLocalFile("test-file.orc");
ORC_UNIQUE_PTR<Type> schema(
Type::buildTypeFromString("struct<id:int,list1:array<string>>"));
WriterOptions options;
ORC_UNIQUE_PTR<Writer> writer = createWriter(*schema, outStream.get(), options);
std::unique_ptr<Writer> writer = createWriter(*type, stream.get(), options);
uint64_t batch_size = 1024, row_count = 2048;
std::unique_ptr<ColumnVectorBatch> batch =
writer->createRowBatch(row_count);
StructVectorBatch &root_batch =
dynamic_cast<StructVectorBatch &>(*batch.get());
LongVectorBatch &id_batch =
dynamic_cast<LongVectorBatch &>(*struct_batch.fields[0]);
ListVectorBatch &list_batch =
dynamic_cast<ListVectorBatch &>(*struct_batch.fields[1]);
StringVectorBatch &str_batch =
dynamic_cast<StringVectorBatch &>(*list_batch.elements.get());
std::vector<std::string> vs{"str1", "str2"};
char **data = str_batch.data.data();
int64_t *offsets = list_batch.offsets.data();
uint64_t offset = 0, rows = 0;
for (size_t i = 0; i < row_count; ++i) {
offsets[rows] = static_cast<int64_t>(offset);
id_batch.data[rows] = articles[i]->get_id();
for (auto &s : vs)
{
data[offset] = &s[0];
str_batch.length[offset++] = s.size();
}
rows++;
if (rows == batch_size)
{
root_batch.numElements = rows;
id_batch.numElements = rows;
list_batch.numElements = rows;
writer->add(*batch);
rows = 0;
offset = 0;
}
}
if (rows != 0)
{
root_batch.numElements = rows;
id_batch.numElements = rows;
list_batch.numElements = rows;
writer->add(*batch);
rows = 0;
offset = 0;
}
writer->close();
}
My question is: what exactly am I doing wrong when setting the list's offsets?