Comments (1)
Steps to replicate
-
Load t3 table data
-
Run the below queries
-- Clean up any previous index tables before rebuilding the IVF index.
-- NOTE(review): the CREATE TABLE statements below all qualify these tables as
-- `a`.<name>, but the original drops were unqualified and therefore depended on
-- the session's current database; qualify them the same way so the script is
-- self-contained and always drops the intended tables.
drop table if exists `a`.`meta`;
drop table if exists `a`.`centroids`;
drop table if exists `a`.`entries`;
-- Index metadata: key/value pairs; the 'version' key (see the upsert below) tracks the
-- current index version used to select the active centroid set.
create table `a`.`meta` (`__mo_index_key` VARCHAR(65535),`__mo_index_val` VARCHAR(65535), primary key ( __mo_index_key ) );
-- Centroids: one row per (index version, centroid id), each holding a 128-dim float
-- vector (VECF32(128)) produced by the k-means clustering step below.
create table `a`.`centroids` (`__mo_index_centroid_version` BIGINT,`__mo_index_centroid_id` BIGINT,`__mo_index_centroid` VECF32(128), primary key ( __mo_index_centroid_version,__mo_index_centroid_id ) );
-- Entries: maps each source-table row (by its primary-key column) to the centroid it was
-- assigned to, keyed by (centroid version, centroid id, source pk).
create table `a`.`entries` (`__mo_index_centroid_fk_version` BIGINT,`__mo_index_centroid_fk_id` BIGINT,`__mo_index_pri_col` INT,`__mo_index_centroid_fk_entry` VECF32(128), primary key ( __mo_index_centroid_fk_version,__mo_index_centroid_fk_id,__mo_index_pri_col ) );
-- Initialize/bump the index version: first run inserts version '0'; subsequent runs hit
-- the duplicate key on `__mo_index_key` and increment the stored value by 1
-- (stored as CHAR, so it is cast to BIGINT for the arithmetic and back).
insert into `a`.`meta` (`__mo_index_key`, `__mo_index_val`) values('version', '0')ON DUPLICATE KEY UPDATE `__mo_index_val` = CAST( (CAST(`__mo_index_val` AS BIGINT) + 1) AS CHAR);
-- Build the centroid set for the current version: sample 10000 rows of t3.b, run
-- k-means ('500,vector_l2_ops,kmeansplusplus,true' => 500 clusters, L2 distance,
-- kmeans++ init), UNNEST the returned centroid list into one row per centroid, and
-- number them with ROW_NUMBER() as the centroid id. The version subquery rereads
-- `meta` so the centroids are tagged with the version bumped above.
insert into `a`.`centroids` (`__mo_index_centroid_version`, `__mo_index_centroid_id`, `__mo_index_centroid`) SELECT (SELECT CAST(`__mo_index_val` AS BIGINT) FROM `meta` WHERE `__mo_index_key` = 'version'), ROW_NUMBER() OVER(), cast(`__mo_index_unnest_cols`.`value` as VARCHAR) FROM (SELECT cluster_centers(`b` kmeans '500,vector_l2_ops,kmeansplusplus,true') AS `__mo_index_centroids_string` FROM (select sample(`b`, 10000 rows, 'row') as `b` from `a`.`t3`) ) AS `__mo_index_centroids_tbl` CROSS JOIN UNNEST(`__mo_index_centroids_tbl`.`__mo_index_centroids_string`) AS `__mo_index_unnest_cols`;
- Run the bottleneck query
-- Bottleneck query: assign every row of a.t3 to its nearest centroid of the current
-- index version and materialize the assignments into a.entries.
-- Shape: the normalized vectors of t3 are CROSS JOINed against the current-version
-- centroids (per the plan below: 1,000,000 rows x 500 centroids = 500,000,000
-- intermediate rows feeding the Aggregate, which dominates at ~1,392,639ms), the
-- GROUP BY per source pk picks the min-L2-distance centroid via
-- serial_full/serial_extract(min(...)), and the result is joined back to t3 to fetch
-- the original vector column `b` for insertion.
explain analyze insert into `a`.`entries` (`__mo_index_centroid_fk_version`, `__mo_index_centroid_fk_id`, `__mo_index_pri_col`, `__mo_index_centroid_fk_entry`) select `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version` , `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id` , `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` , `t3`.`b` from (select `t3`.`a` as `__mo_org_tbl_pk_may_serial_col`, `t3`.`b` from `a`.`t3`) as `t3` inner join (select `centroids`.`__mo_index_centroid_version` as `__mo_index_centroid_version`, serial_extract( min( serial_full( l2_distance(`centroids`.`__mo_index_centroid`, `t3`.`__mo_org_tbl_norm_vec_col`), `centroids`.`__mo_index_centroid_id`)), 1 as bigint) as `__mo_index_joined_centroid_id`, `__mo_org_tbl_pk_may_serial_col`from (select `t3`.`a` as `__mo_org_tbl_pk_may_serial_col`, normalize_l2(`t3`.`b`) as `__mo_org_tbl_norm_vec_col` from `a`.`t3`) as `t3` CROSS JOIN (select * from `a`.`centroids` where `__mo_index_centroid_version` = (select CAST(__mo_index_val as BIGINT) from `a`.`meta` where `__mo_index_key` = 'version')) as `centroids` group by `__mo_index_centroid_version`, __mo_org_tbl_pk_may_serial_col ) as `__mo_index_tbl_join_centroids` on `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` = `t3`.`__mo_org_tbl_pk_may_serial_col` order by `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version`, `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id` ;
-- ### SQL Pretty Print ###
--
--INSERT INTO `a`.`entries` (`__mo_index_centroid_fk_version`, `__mo_index_centroid_fk_id`, `__mo_index_pri_col`, `__mo_index_centroid_fk_entry`)
--SELECT
-- `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version`,
-- `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id`,
-- `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col`,
-- `t3`.`b`
--FROM
-- (SELECT `t3`.`a` AS `__mo_org_tbl_pk_may_serial_col`, `t3`.`b` FROM `a`.`t3`) AS `t3`
--INNER JOIN
-- (
-- SELECT
-- `centroids`.`__mo_index_centroid_version` AS `__mo_index_centroid_version`,
-- serial_extract(min(serial_full(l2_distance(`centroids`.`__mo_index_centroid`, `t3`.`__mo_org_tbl_norm_vec_col`), `centroids`.`__mo_index_centroid_id`)), 1 AS bigint) AS `__mo_index_joined_centroid_id`,
-- `__mo_org_tbl_pk_may_serial_col`
-- FROM
-- (SELECT `t3`.`a` AS `__mo_org_tbl_pk_may_serial_col`,normalize_l2(`t3`.`b`) AS `__mo_org_tbl_norm_vec_col` FROM `a`.`t3`) AS `t3`
-- CROSS JOIN
-- (SELECT * FROM `a`.`centroids` WHERE `__mo_index_centroid_version` = (SELECT CAST(__mo_index_val AS bigint) FROM `a`.`meta` WHERE `__mo_index_key` = 'version')) AS `centroids`
-- GROUP BY
-- `__mo_index_centroid_version`,
--    `__mo_org_tbl_pk_may_serial_col`
--
-- ) AS `__mo_index_tbl_join_centroids`
--ON
-- `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` = `t3`.`__mo_org_tbl_pk_may_serial_col`
--ORDER BY
-- `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version`,
-- `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id`;
EXPLAIN ANALYZE
QUERY PLAN |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
Plan 0: |
Sink |
Analyze: timeConsumed=204857ms waitTime=1212935ms inputRows=2000000 outputRows=0 InputSize=1083mb OutputSize=0bytes MemorySize=66mb |
-> Lock |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes |
-> PreInsert on a.entries |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes |
-> Project |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes |
-> Sort |
Analyze: timeConsumed=2060ms sort_time=[total=1171ms,min=88ms,max=138ms,dop=10] mergesort_time=[889ms] waitTime=199967ms inputRows=0 outputRows=1000000 InputSize=0bytes OutputSize=530mb MemorySize=0bytes |
Sort Key: __mo_index_tbl_join_centroids.__mo_index_centroid_version INTERNAL, __mo_index_tbl_join_centroids.__mo_index_joined_centroid_id INTERNAL |
-> Project |
Analyze: timeConsumed=442ms waitTime=0ms inputRows=1000000 outputRows=1000000 InputSize=545mb OutputSize=530mb MemorySize=0bytes |
-> Join |
Analyze: timeConsumed=934ms probe_time=[total=897ms,min=58ms,max=128ms,dop=10] build_time=[37ms] waitTime=2596446ms inputRows=2000000 outputRows=1000000 InputSize=549mb OutputSize=545mb MemorySize=386mb |
Join Type: INNER |
Join Cond: (t3.a = t3.__mo_org_tbl_pk_may_serial_col) |
-> Table Scan on a.t3 |
Analyze: timeConsumed=1823ms scan_time=[total=1822ms,min=150ms,max=289ms,dop=8] waitTime=604587ms inputRows=1000000 outputRows=1000000 InputSize=514mb OutputSize=514mb MemorySize=33mb |
-> Aggregate |
Analyze: timeConsumed=1392639ms group_time=[total=1392438ms,min=128546ms,max=146959ms,dop=10] mergegroup_time=[200ms] waitTime=398987ms inputRows=500000000 outputRows=1000000 InputSize=508gb OutputSize=34mb Memo|
Group Key: centroids.__mo_index_centroid_version, t3.a |
Aggregate Functions: min(serial_full(l2_distance(centroids.__mo_index_centroid, normalize_l2(t3.b)), centroids.__mo_index_centroid_id)) |
-> Join |
Analyze: timeConsumed=501759ms probe_time=[total=501759ms,min=47845ms,max=52527ms,dop=10] build_time=[0ms] waitTime=43308ms inputRows=1000500 outputRows=10237 InputSize=515mb OutputSize=508gb MemorySize=27|
Join Type: INNER |
-> Table Scan on a.t3 |
Analyze: timeConsumed=2448ms scan_time=[total=2432ms,min=241ms,max=373ms,dop=8] waitTime=142ms inputRows=1000000 outputRows=1000000 InputSize=514mb OutputSize=514mb MemorySize=33mb |
-> Filter |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=500 outputRows=500 InputSize=288000bytes OutputSize=276000bytes MemorySize=500bytes |
Filter Cond: (centroids.__mo_index_centroid_version = cast(meta.__mo_index_val AS BIGINT)) |
-> Join |
Analyze: timeConsumed=0ms waitTime=2ms inputRows=1 outputRows=500 InputSize=24bytes OutputSize=288000bytes MemorySize=24bytes |
Join Type: SINGLE |
-> Table Scan on a.centroids |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=500 outputRows=500 InputSize=276000bytes OutputSize=276000bytes MemorySize=276000bytes |
-> Table Scan on a.meta |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=1 outputRows=1 InputSize=48bytes OutputSize=24bytes MemorySize=49bytes |
Filter Cond: (meta.__mo_index_key = 'version') |
Plan 1: |
Insert on a.entries |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes |
-> Sink Scan |
DataSource: Plan 0 |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes |
Plan 2: |
Fuzzy Filter for duplicate key |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes |
Runtime Filter Build: #[0,0] |
-> Table Scan on a.entries |
Analyze: timeConsumed=0ms waitTime=0ms inputRows=0 outputRows=0 InputSize=0bytes OutputSize=0bytes MemorySize=0bytes |
Runtime Filter Probe: __mo_cpkey_col |
-> Sink Scan |
DataSource: Plan 0 |
Analyze: timeConsumed=3ms waitTime=0ms inputRows=1000000 outputRows=1000000 InputSize=553mb OutputSize=22mb MemorySize=2mb |
CPU Profile
- Run the command
curl -o cpu.pprof "http://localhost:9876/debug/pprof/profile?seconds=180";
go tool pprof -http=:8080 cpu.pprof
Heap Profile
- Run
curl -o heap.pprof "http://localhost:9876/debug/pprof/heap?seconds=180"
go tool pprof -http=:8080 heap.pprof;
from matrixone.
Related Issues (20)
- [Bug]: rolling-update CN behind proxy cause prepared stmt lost HOT 1
- [Bug]: force flush failed. HOT 3
- [Bug]: snapshot bvt may panic HOT 3
- [Bug]: restore account HOT 3
- [Bug]: restore account if exists fk table ref other database will report "can not drop database. It has been referenced by foreign keys" HOT 1
- [Tech Request]: rename restrict operator to filter
- [Tech Request]: optimize duplicate check memory usage for sql like insert into t1 select from t2
- [Bug]: stock_level txn timeout in 60s in tpcc 10-10 longrunning during statbility test on distributed HOT 2
- [Bug]: [tke regression] tpcc 100-1000 test oom. HOT 6
- [Bug]: CI/multi cn e2e run bvt `load data LOCAL infile` got FileNotFoundException HOT 4
- [Bug]: UT TestKill got error `panic BUG: StartStatement called twice`
- [Tech Request]: support read from cache without copy
- [Tech Request]: add txn id in log
- [Bug]: restore fk table report can not find table by id in ci HOT 1
- [Bug]: Optimizer doesn't support ProjectList in SEMI Join
- [Bug]: restore single db multi table with fk report 'no such table'.
- [Bug]: panic during merge when object is empty
- [Bug]: `context deadline exceeded` error occurs
- [Bug]: New account used about 16GB storage HOT 1
- [Bug]: w-w conflict in MO Checkin Regression(Standalone BVT) / Multi-CN e2e BVT(Race) Test on Linux/x64
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from matrixone.