PooledArrays.jl icon indicating copy to clipboard operation
PooledArrays.jl copied to clipboard

improve median for pooled vectors

Open bkamins opened this issue 4 years ago • 1 comments

If length of pool is much smaller than the number of entries we can run the following (working code):

function median_fast(x::PooledVector)
    n = length(x)
    p = sortperm(x.pool)
    counts = zeros(Int, length(p))
    for v in x.refs
        counts[v] += 1
    end
    cum = 0
    for (j,i) in enumerate(p)
        cum += counts[i]
        if isodd(n)
            if cum >= div(n + 1, 2)
                return middle(x.pool[i])
            end
        else
            if cum >= div(n, 2)
                if cum == div(n, 2)
                    return middle(x.pool[i], x.pool[p[j+1]])
                else
                    return middle(x.pool[i])
                end
            end
        end
    end
    error("unreachable reached")
end

Example timings:

julia> x = PooledArray(rand(1:100, 10^8));

julia> @time median_fast(x);
  0.063734 seconds (4 allocations: 1.781 KiB)

julia> @time median_fast(x);
  0.070556 seconds (4 allocations: 1.781 KiB)

julia> @time median(x);
  0.931585 seconds (3 allocations: 762.940 MiB, 9.86% gc time)

julia> @time median(x);
  0.955555 seconds (3 allocations: 762.940 MiB, 14.29% gc time)

bkamins avatar Oct 15 '21 12:10 bkamins

Cool. BTW, it would also make sense to define an optimized method for sort like in CategoricalArrays: https://github.com/JuliaData/CategoricalArrays.jl/blob/058ee8bd73e32f947363978ca061f69f4a6328af/src/array.jl#L1059-L1092

nalimilan avatar Oct 15 '21 12:10 nalimilan