orc icon indicating copy to clipboard operation
orc copied to clipboard

How to set list's offsets correctly

Open cubter opened this issue 3 years ago • 0 comments

Greetings, I'm learning to work with ORC in C++, and I'm stuck: I don't quite understand how to set an array's offsets. Specifically, the following code, when executed, results in this exception: "Caught exception in test-file.orc: bad read in nextBuffer":

void write_orc()
{
    using namespace orc;

    ORC_UNIQUE_PTR<OutputStream> outStream = writeLocalFile("test-file.orc");
    ORC_UNIQUE_PTR<Type> schema(
        Type::buildTypeFromString("struct<id:int,list1:array<string>>"));
    WriterOptions options;
    ORC_UNIQUE_PTR<Writer> writer = createWriter(*schema, outStream.get(), options);

    std::unique_ptr<Writer> writer = createWriter(*type, stream.get(), options);

    uint64_t batch_size = 1024, row_count = 2048;

    std::unique_ptr<ColumnVectorBatch> batch =
        writer->createRowBatch(row_count);
    StructVectorBatch &root_batch =
        dynamic_cast<StructVectorBatch &>(*batch.get());
    LongVectorBatch &id_batch =
        dynamic_cast<LongVectorBatch &>(*struct_batch.fields[0]);
    ListVectorBatch &list_batch =
        dynamic_cast<ListVectorBatch &>(*struct_batch.fields[1]);
    StringVectorBatch &str_batch =
        dynamic_cast<StringVectorBatch &>(*list_batch.elements.get());
    
    std::vector<std::string> vs{"str1", "str2"};

    char **data         = str_batch.data.data();
    int64_t *offsets    = list_batch.offsets.data();
    uint64_t offset     = 0, rows = 0;
    for (size_t i = 0; i < row_count; ++i) {
        offsets[rows] = static_cast<int64_t>(offset);

        id_batch.data[rows] = articles[i]->get_id();

        for (auto &s : vs)
        {
            data[offset] = &s[0];
            str_batch.length[offset++] = s.size();
        }

        rows++;
        if (rows == batch_size) 
        {
            root_batch.numElements = rows;
            id_batch.numElements   = rows;
            list_batch.numElements = rows;

            writer->add(*batch);
            rows = 0;
            offset = 0;
        }
    }

    if (rows != 0) 
    {
        root_batch.numElements = rows;
        id_batch.numElements   = rows;
        list_batch.numElements = rows;

        writer->add(*batch);
        rows = 0;
        offset = 0;
    }

    writer->close();
}

My question is: what exactly am I doing wrong when setting the list's offsets?

cubter avatar Aug 14 '22 17:08 cubter