matrixone icon indicating copy to clipboard operation
matrixone copied to clipboard

[Bug]: Improve Performance of "CREATE VECTOR INDEX" on GIST 960

Open arjunsk opened this issue 1 year ago • 5 comments

Is there an existing issue for the same bug?

  • [X] I have checked the existing issues.

Branch Name

main

Commit ID

6003d8346f0fed6e1fbd0563bc7bc2b1d2287741

Other Environment Information

- Hardware parameters:
- OS type:
- Others:

Actual Behavior

Create Index: 14mins (1min for create index, 12mins for mapping) Query: Recall: 0.5461, QPS: 2.8397

Expected Behavior

Create Index : should take at max 5mins Query: Improve the QPS numbers

Steps to Reproduce

1. Run `GIST 960` insert
2. Run QPS code

Additional information

arjunsk avatar May 10 '24 20:05 arjunsk

Steps to reproduce

  1. Load t5 table with VECF32(960)
  2. Run the below query
drop table if exists meta;
drop table if exists centroids;
drop table if exists entries;



create table `a`.`meta` (`__mo_index_key` VARCHAR(65535),`__mo_index_val` VARCHAR(65535), primary key ( __mo_index_key ) );

create table `a`.`centroids` (`__mo_index_centroid_version` BIGINT,`__mo_index_centroid_id` BIGINT,`__mo_index_centroid` VECF32(960), primary key ( __mo_index_centroid_version,__mo_index_centroid_id ) );

create table `a`.`entries` (`__mo_index_centroid_fk_version` BIGINT,`__mo_index_centroid_fk_id` BIGINT,`__mo_index_pri_col` INT,`__mo_index_centroid_fk_entry` VECF32(960), primary key ( __mo_index_centroid_fk_version,__mo_index_centroid_fk_id,__mo_index_pri_col ) );


insert into `a`.`meta` (`__mo_index_key`, `__mo_index_val`) values('version', '0')ON DUPLICATE KEY UPDATE `__mo_index_val` = CAST( (CAST(`__mo_index_val` AS BIGINT) + 1) AS CHAR);

insert into `a`.`centroids` (`__mo_index_centroid_version`, `__mo_index_centroid_id`, `__mo_index_centroid`) SELECT (SELECT CAST(`__mo_index_val` AS BIGINT) FROM `meta` WHERE `__mo_index_key` = 'version'), ROW_NUMBER() OVER(), cast(`__mo_index_unnest_cols`.`value` as VARCHAR) FROM (SELECT cluster_centers(`b` kmeans '500,vector_l2_ops,kmeansplusplus,true') AS `__mo_index_centroids_string` FROM (select sample(`b`, 10000 rows, 'row') as `b` from `a`.`t5`) ) AS `__mo_index_centroids_tbl` CROSS JOIN UNNEST(`__mo_index_centroids_tbl`.`__mo_index_centroids_string`) AS `__mo_index_unnest_cols`;


  1. Run the bottleneck query
explain analyze insert into `a`.`entries` (`__mo_index_centroid_fk_version`, `__mo_index_centroid_fk_id`, `__mo_index_pri_col`, `__mo_index_centroid_fk_entry`)  select `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version` , `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id` , `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` , `t5`.`b`  from (select `t5`.`a` as `__mo_org_tbl_pk_may_serial_col`, `t5`.`b` from `a`.`t5`) as `t5` inner join (select `centroids`.`__mo_index_centroid_version` as `__mo_index_centroid_version`, serial_extract( min( serial_full( l2_distance(`centroids`.`__mo_index_centroid`, `t5`.`__mo_org_tbl_norm_vec_col`), `centroids`.`__mo_index_centroid_id`)), 1 as bigint)  as `__mo_index_joined_centroid_id`, `__mo_org_tbl_pk_may_serial_col`from (select `t5`.`a` as `__mo_org_tbl_pk_may_serial_col`, normalize_l2(`t5`.`b`) as `__mo_org_tbl_norm_vec_col`  from `a`.`t5`) as `t5` CROSS JOIN (select * from `a`.`centroids` where `__mo_index_centroid_version` = (select CAST(__mo_index_val as BIGINT) from `a`.`meta` where `__mo_index_key` = 'version'))  as `centroids` group by `__mo_index_centroid_version`, __mo_org_tbl_pk_may_serial_col ) as `__mo_index_tbl_join_centroids` on `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` = `t5`.`__mo_org_tbl_pk_may_serial_col` order by `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version`, `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id` ;


-- ### SQL Pretty Print ### 

--
--INSERT INTO `a`.`entries` (`__mo_index_centroid_fk_version`, `__mo_index_centroid_fk_id`, `__mo_index_pri_col`, `__mo_index_centroid_fk_entry`)
--SELECT
--    `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version`,
--    `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id`,
--    `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col`,
--    `t5`.`b`
--FROM
--      (SELECT `t5`.`a` AS `__mo_org_tbl_pk_may_serial_col`, `t5`.`b` FROM `a`.`t5`) AS `t5`
--INNER JOIN
--      (
--        SELECT
--        `centroids`.`__mo_index_centroid_version` AS `__mo_index_centroid_version`,
--        serial_extract(min(serial_full(l2_distance(`centroids`.`__mo_index_centroid`, `t5`.`__mo_org_tbl_norm_vec_col`), `centroids`.`__mo_index_centroid_id`)), 1 AS bigint) AS `__mo_index_joined_centroid_id`,
--        `__mo_org_tbl_pk_may_serial_col`
--       FROM
--         (SELECT `t5`.`a` AS `__mo_org_tbl_pk_may_serial_col`,normalize_l2(`t5`.`b`) AS `__mo_org_tbl_norm_vec_col` FROM `a`.`t5`) AS `t5`
--            CROSS JOIN
--         (SELECT * FROM `a`.`centroids` WHERE `__mo_index_centroid_version` = (SELECT CAST(__mo_index_val AS bigint) FROM `a`.`meta` WHERE `__mo_index_key` = 'version')) AS `centroids`
--         GROUP BY
--            `__mo_index_centroid_version`,
--            `__mo_org_tbl_pk_may_serial_col
--
--      ) AS `__mo_index_tbl_join_centroids`
--ON
--    `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` = `t5`.`__mo_org_tbl_pk_may_serial_col`
--ORDER BY
--    `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version`,
--    `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id`;

image

arjunsk avatar May 10 '24 20:05 arjunsk

EXPLAIN ANALYZE


+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| QUERY PLAN                                                                                                                                                                                                                                                                   |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Plan 0:                                                                                                                                                                                                                                                                      |
| Sink                                                                                                                                                                                                                                                                         |
|   Analyze: timeConsumed=4218206ms waitTime=4194757ms inputRows=2000000 outputRows=0 InputSize=7431mb OutputSize=0bytes MemorySize=64mb                                                                                                                                       |
|   ->  Lock                                                                                                                                                                                                                                                                   |
|         Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes                                                                                                                                                 |
|         ->  PreInsert on a.entries                                                                                                                                                                                                                                           |
|               Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes                                                                                                                                           |
|               ->  Project                                                                                                                                                                                                                                                    |
|                     Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes                                                                                                                                     |
|                     ->  Sort                                                                                                                                                                                                                                                 |
|                           Analyze: timeConsumed=12467ms sort_time=[total=5928ms,min=496ms,max=822ms,dop=10] mergesort_time=[6538ms] waitTime=690181ms inputRows=0 outputRows=1000000 InputSize=0bytes OutputSize=3704mb MemorySize=0bytes                                    |
|                           Sort Key: __mo_index_tbl_join_centroids.__mo_index_centroid_version INTERNAL, __mo_index_tbl_join_centroids.__mo_index_joined_centroid_id INTERNAL                                                                                                 |
|                           ->  Project                                                                                                                                                                                                                                        |
|                                 Analyze: timeConsumed=570ms waitTime=0ms inputRows=1000000 outputRows=1000000 InputSize=3719mb OutputSize=3704mb MemorySize=0bytes                                                                                                           |
|                                 ->  Join                                                                                                                                                                                                                                     |
|                                       Analyze: timeConsumed=7257ms probe_time=[total=7216ms,min=525ms,max=892ms,dop=10] build_time=[40ms] waitTime=8956081ms inputRows=2000000 outputRows=1000000 InputSize=3723mb OutputSize=3719mb MemorySize=386mb                        |
|                                       Join Type: INNER                                                                                                                                                                                                                       |
|                                       Join Cond: (t5.a = t5.__mo_org_tbl_pk_may_serial_col)                                                                                                                                                                                  |
|                                       ->  Table Scan on a.t5                                                                                                                                                                                                                 |
|                                             Analyze: timeConsumed=24340ms scan_time=[total=24338ms,min=2375ms,max=3719ms,dop=8] waitTime=5547818ms inputRows=1000000 outputRows=1000000 InputSize=3688mb OutputSize=3688mb MemorySize=241mb                                  |
|                                       ->  Aggregate                                                                                                                                                                                                                          |
|                                             Analyze: timeConsumed=2842570ms group_time=[total=2842240ms,min=278518ms,max=293388ms,dop=10] mergegroup_time=[329ms] waitTime=1375273ms inputRows=500000000 outputRows=1000000 InputSize=3607gb OutputSize=34mb MemorySize=22mb |
|                                             Group Key: centroids.__mo_index_centroid_version, t5.a                                                                                                                                                                           |
|                                             Aggregate Functions: min(serial_full(l2_distance(centroids.__mo_index_centroid, normalize_l2(t5.b)), centroids.__mo_index_centroid_id))                                                                                          |
|                                             ->  Join                                                                                                                                                                                                                         |
|                                                   Analyze: timeConsumed=3863294ms probe_time=[total=3863290ms,min=381731ms,max=394558ms,dop=10] build_time=[3ms] waitTime=25297ms inputRows=1000500 outputRows=2880 InputSize=3690mb OutputSize=3607gb MemorySize=1mb        |
|                                                   Join Type: INNER                                                                                                                                                                                                           |
|                                                   ->  Table Scan on a.t5                                                                                                                                                                                                     |
|                                                         Analyze: timeConsumed=23316ms scan_time=[total=23311ms,min=2071ms,max=3443ms,dop=8] waitTime=863ms inputRows=1000000 outputRows=1000000 InputSize=3688mb OutputSize=3688mb MemorySize=241mb                          |
|                                                   ->  Filter                                                                                                                                                                                                                 |
|                                                         Analyze: timeConsumed=0ms waitTime=0ms inputRows=500 outputRows=500 InputSize=1mb OutputSize=1mb MemorySize=500bytes                                                                                                 |
|                                                         Filter Cond: (centroids.__mo_index_centroid_version = cast(meta.__mo_index_val AS BIGINT))                                                                                                                           |
|                                                         ->  Join                                                                                                                                                                                                             |
|                                                               Analyze: timeConsumed=0ms probe_time=[0ms,0ms] build_time=[0ms] waitTime=22ms inputRows=501 outputRows=500 InputSize=1mb OutputSize=1mb MemorySize=24bytes                                                     |
|                                                               Join Type: SINGLE                                                                                                                                                                                              |
|                                                               ->  Table Scan on a.centroids                                                                                                                                                                                  |
|                                                                     Analyze: timeConsumed=5ms waitTime=0ms inputRows=500 outputRows=500 InputSize=1mb OutputSize=1mb MemorySize=1mb                                                                                          |
|                                                               ->  Table Scan on a.meta                                                                                                                                                                                       |
|                                                                     Analyze: timeConsumed=0ms waitTime=0ms inputRows=1 outputRows=1 InputSize=48bytes OutputSize=24bytes MemorySize=49bytes                                                                                  |
|                                                                     Filter Cond: (meta.__mo_index_key = 'version')                                                                                                                                                           |
| Plan 1:                                                                                                                                                                                                                                                                      |
| Insert on a.entries                                                                                                                                                                                                                                                          |
|   Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes                                                                                                                                                       |
|   ->  Sink Scan                                                                                                                                                                                                                                                              |
|         DataSource: Plan 0                                                                                                                                                                                                                                                   |
|         Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes                                                                                                                                                 |
| Plan 2:                                                                                                                                                                                                                                                                      |
| Fuzzy Filter for duplicate key                                                                                                                                                                                                                                               |
|   Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes                                                                                                                                                       |
|   Runtime Filter Build: #[0,0]                                                                                                                                                                                                                                               |
|   ->  Table Scan on a.entries                                                                                                                                                                                                                                                |
|         Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes                                                                                                                                                 |
|         Runtime Filter Probe: __mo_cpkey_col                                                                                                                                                                                                                                 |
|   ->  Sink Scan                                                                                                                                                                                                                                                              |
|         DataSource: Plan 0                                                                                                                                                                                                                                                   |
|         Analyze: timeConsumed=5ms waitTime=0ms inputRows=1000000 outputRows=1000000 InputSize=3726mb OutputSize=22mb MemorySize=414696bytes                                                                                                                                  |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
56 rows in set (11 min 44.00 sec)

CPU Profile

image

cpu.pprof.zip

arjunsk avatar May 10 '24 20:05 arjunsk

Proposal

Implement L2 INNER JOIN

select centroids.version, centroids.id, centroids.centroid, tbl.embedding from centroids l2 inner join tbl where centroids.centroid = tbl.embedding

NOTES

  • where centroids.centroid = tbl.embedding is only used to map embedding columns from both the tables. We will map the closest centroid to the table embedding.

arjunsk avatar May 10 '24 20:05 arjunsk

EXPLAIN with SELECT instead of INSERT

mysql> explain analyze select `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version` , `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id` , `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` , `t5`.`b`  from (select `t5`.`a` as `__mo_org_tbl_pk_may_serial_col`, `t5`.`b` from `a`.`t5`) as `t5` inner join (select `centroids`.`__mo_index_centroid_version` as `__mo_index_centroid_version`, serial_extract( min( serial_full( l2_distance(`centroids`.`__mo_index_centroid`, `t5`.`__mo_org_tbl_norm_vec_col`), `centroids`.`__mo_index_centroid_id`)), 1 as bigint)  as `__mo_index_joined_centroid_id`, `__mo_org_tbl_pk_may_serial_col`from (select `t5`.`a` as `__mo_org_tbl_pk_may_serial_col`, normalize_l2(`t5`.`b`) as `__mo_org_tbl_norm_vec_col`  from `a`.`t5`) as `t5` CROSS JOIN (select * from `a`.`centroids` where `__mo_index_centroid_version` = (select CAST(__mo_index_val as BIGINT) from `a`.`meta` where `__mo_index_key` = 'version'))  as `centroids` group by `__mo_index_centroid_version`, __mo_org_tbl_pk_may_serial_col ) as `__mo_index_tbl_join_centroids` on `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` = `t5`.`__mo_org_tbl_pk_may_serial_col` order by `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version`, `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id` ;
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| QUERY PLAN                                                                                                                                                                                                                                                 |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Project                                                                                                                                                                                                                                                    |
|   Analyze: timeConsumed=0ms waitTime=693150ms inputRows=1000000 outputRows=1000000 InputSize=3704mb OutputSize=3704mb MemorySize=0bytes                                                                                                                    |
|   ->  Sort                                                                                                                                                                                                                                                 |
|         Analyze: timeConsumed=10567ms sort_time=[total=5897ms,min=422ms,max=1035ms,dop=10] mergesort_time=[4670ms] waitTime=688479ms inputRows=0 outputRows=1000000 InputSize=0bytes OutputSize=3704mb MemorySize=0bytes                                   |
|         Sort Key: __mo_index_tbl_join_centroids.__mo_index_centroid_version INTERNAL, __mo_index_tbl_join_centroids.__mo_index_joined_centroid_id INTERNAL                                                                                                 |
|         ->  Project                                                                                                                                                                                                                                        |
|               Analyze: timeConsumed=496ms waitTime=0ms inputRows=1000000 outputRows=1000000 InputSize=3719mb OutputSize=3704mb MemorySize=0bytes                                                                                                           |
|               ->  Join                                                                                                                                                                                                                                     |
|                     Analyze: timeConsumed=7303ms probe_time=[total=7261ms,min=467ms,max=1013ms,dop=10] build_time=[40ms] waitTime=8933913ms inputRows=2000000 outputRows=1000000 InputSize=3723mb OutputSize=3719mb MemorySize=386mb                       |
|                     Join Type: INNER                                                                                                                                                                                                                       |
|                     Join Cond: (t5.a = t5.__mo_org_tbl_pk_may_serial_col)                                                                                                                                                                                  |
|                     ->  Table Scan on a.t5                                                                                                                                                                                                                 |
|                           Analyze: timeConsumed=27262ms scan_time=[total=27262ms,min=2252ms,max=4212ms,dop=8] waitTime=2265ms inputRows=1000000 outputRows=1000000 InputSize=3688mb OutputSize=3688mb MemorySize=241mb                                     |
|                     ->  Aggregate                                                                                                                                                                                                                          |
|                           Analyze: timeConsumed=2627848ms group_time=[total=2627668ms,min=255031ms,max=278412ms,dop=10] mergegroup_time=[179ms] waitTime=1371963ms inputRows=500000000 outputRows=1000000 InputSize=3607gb OutputSize=34mb MemorySize=22mb |
|                           Group Key: centroids.__mo_index_centroid_version, t5.a                                                                                                                                                                           |
|                           Aggregate Functions: min(serial_full(l2_distance(centroids.__mo_index_centroid, normalize_l2(t5.b)), centroids.__mo_index_centroid_id))                                                                                          |
|                           ->  Join                                                                                                                                                                                                                         |
|                                 Analyze: timeConsumed=4021931ms probe_time=[total=4021929ms,min=398229ms,max=408028ms,dop=10] build_time=[0ms] waitTime=50223ms inputRows=1000500 outputRows=2880 InputSize=3690mb OutputSize=3607gb MemorySize=1mb        |
|                                 Join Type: INNER                                                                                                                                                                                                           |
|                                 ->  Table Scan on a.t5                                                                                                                                                                                                     |
|                                       Analyze: timeConsumed=38500ms scan_time=[total=38498ms,min=4301ms,max=5728ms,dop=8] waitTime=1751ms inputRows=1000000 outputRows=1000000 InputSize=3688mb OutputSize=3688mb MemorySize=241mb                         |
|                                 ->  Filter                                                                                                                                                                                                                 |
|                                       Analyze: timeConsumed=0ms waitTime=0ms inputRows=500 outputRows=500 InputSize=1mb OutputSize=1mb MemorySize=500bytes                                                                                                 |
|                                       Filter Cond: (centroids.__mo_index_centroid_version = cast(meta.__mo_index_val AS BIGINT))                                                                                                                           |
|                                       ->  Join                                                                                                                                                                                                             |
|                                             Analyze: timeConsumed=0ms probe_time=[0ms,0ms] build_time=[0ms] waitTime=17ms inputRows=501 outputRows=500 InputSize=1mb OutputSize=1mb MemorySize=24bytes                                                     |
|                                             Join Type: SINGLE                                                                                                                                                                                              |
|                                             ->  Table Scan on a.centroids                                                                                                                                                                                  |
|                                                   Analyze: timeConsumed=3ms waitTime=0ms inputRows=500 outputRows=500 InputSize=1mb OutputSize=1mb MemorySize=1mb                                                                                          |
|                                             ->  Table Scan on a.meta                                                                                                                                                                                       |
|                                                   Analyze: timeConsumed=0ms waitTime=0ms inputRows=1 outputRows=1 InputSize=48bytes OutputSize=24bytes MemorySize=49bytes                                                                                  |
|                                                   Filter Cond: (meta.__mo_index_key = 'version')                                                                                                                                                           |
|                                                   Block Filter Cond: (meta.__mo_index_key = 'version')                                                                                                                                                     |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
34 rows in set (11 min 33.18 sec)



arjunsk avatar May 15 '24 01:05 arjunsk

image

arjunsk avatar May 17 '24 21:05 arjunsk

GIST 960 takes 3 minutes and 15 seconds to build.

Add the below configs to cn.toml


[[fileservice]]
name = "SHARED"
backend = "DISK"
data-dir = "mo-data/shared"

[fileservice.cache]
memory-capacity = "6GB"
disk-path = "mo-data/file-service-cache"

arjunsk avatar Jun 01 '24 20:06 arjunsk