Embeddings: implement db package (#55372)

This implements a new `VectorDB` abstraction with a qdrant backend. It's
not used yet. Stacked PRs will incorporate this into the existing stack.
This commit is contained in:
Camden Cheek 2023-08-02 12:02:17 -06:00 committed by GitHub
parent 60c1f0bb31
commit 2a7ea9d279
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 631 additions and 0 deletions

View File

@ -5689,6 +5689,13 @@ def go_dependencies():
sum = "h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M=",
version = "v0.0.0-20170810143723-de5bf2ad4578",
)
go_repository(
name = "com_github_qdrant_go_client",
build_file_proto_mode = "disable_global",
importpath = "github.com/qdrant/go-client",
sum = "h1:LckV8C0TtMPDqWPd5g5sIa9zELQMelRlcZk9ANSZ2H8=",
version = "v1.3.0",
)
go_repository(
name = "com_github_quasoft_websspi",

1
go.mod
View File

@ -271,6 +271,7 @@ require (
github.com/jackc/pgerrcode v0.0.0-20220416144525-469b46aa5efa
github.com/mitchellh/go-homedir v1.1.0
github.com/mroth/weightedrand/v2 v2.0.1
github.com/qdrant/go-client v1.3.0
github.com/ricochet2200/go-disk-usage/du v0.0.0-20210707232629-ac9918953285
github.com/tj/assert v0.0.0-20190920132354-ee03d75cd160
github.com/vektah/gqlparser/v2 v2.4.5

16
go.sum
View File

@ -130,6 +130,7 @@ github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBp
github.com/AzureAD/microsoft-authentication-library-for-go v0.5.1/go.mod h1:Vt9sXTKwMyGcOxSmLDMnGPgqsUg7m8pe215qMLrDXw4=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/CloudyKit/fastprinter v0.0.0-20200109182630-33d98a066a53/go.mod h1:+3IMCy2vIlbG1XG/0ggNQv0SvxCAIpPM5b1nCz56Xno=
github.com/DataDog/zstd v1.5.0 h1:+K/VEwIAaPcHiMtQvpLD4lqW7f0Gk3xdYZmI1hD+CXo=
github.com/DataDog/zstd v1.5.0/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw=
github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.16.1 h1:/o9L4jKKshKO6U4q6e5oo0SkVtF5DDNLGK+liqsDt+w=
@ -255,6 +256,7 @@ github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmV
github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/asaskevich/govalidator v0.0.0-20180720115003-f9ffefc3facf/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/asaskevich/govalidator v0.0.0-20200108200545-475eaeb16496/go.mod h1:oGkLhpf+kjZl6xBf758TQhh5XrAeiJv/7FRz/2spLIg=
@ -452,6 +454,7 @@ github.com/cockroachdb/apd/v2 v2.0.1 h1:y1Rh3tEU89D+7Tgbw+lp52T6p/GJLpDmNvr10UWq
github.com/cockroachdb/apd/v2 v2.0.1/go.mod h1:DDxRlzC2lo3/vSlmSoS7JkqbbrARPuFOGr0B9pvN3Gw=
github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa/go.mod h1:zn76sxSg3SzpJ0PPJaLDCu+Bu0Lg3sKTORVIj19EIF8=
github.com/cockroachdb/datadriven v0.0.0-20200714090401-bf6692d28da5/go.mod h1:h6jFvWxBdQXxjopDMZyH2UVceIRfR84bdzbkoKrsWNo=
github.com/cockroachdb/datadriven v1.0.2/go.mod h1:a9RdTaap04u637JoCzcUoIcDmvwSUtcUFtT/C3kJlTU=
github.com/cockroachdb/errors v1.2.4/go.mod h1:rQD95gz6FARkaKkQXUksEje/d9a6wBJoCr5oaCLELYA=
github.com/cockroachdb/errors v1.10.0 h1:lfxS8zZz1+OjtV4MtNWgboi/W5tyLEB6VQZBXN+0VUU=
github.com/cockroachdb/errors v1.10.0/go.mod h1:lknhIsEVQ9Ss/qKDBQS/UqFSvPQjOwNq2qyKAxtHRqE=
@ -460,6 +463,7 @@ github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZe
github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs=
github.com/cockroachdb/redact v1.1.5 h1:u1PMllDkdFfPWaNGMyLD1+so+aq3uUItthCFqzwPJ30=
github.com/cockroachdb/redact v1.1.5/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg=
github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM=
github.com/containerd/aufs v0.0.0-20200908144142-dab0cbea06f4/go.mod h1:nukgQABAEopAHvB6j7cnP5zJ+/3aVcE7hCYqvIwAHyE=
github.com/containerd/aufs v0.0.0-20201003224125-76a6863f2989/go.mod h1:AkGGQs9NM2vtYHaUen+NljV0/baGCAPELGm2q9ZXpWU=
github.com/containerd/aufs v0.0.0-20210316121734-20793ff83c97/go.mod h1:kL5kd6KM5TzQjR79jljyi4olc1Vrx6XBlcyj3gNv2PU=
@ -687,6 +691,7 @@ github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkp
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/edsrzf/mmap-go v1.1.0 h1:6EUwBLQ/Mcr1EYLE4Tn1VdW1A4ckqCQWZBw8Hr0kjpQ=
github.com/edsrzf/mmap-go v1.1.0/go.mod h1:19H/e8pUPLicwkyNgOykDXkJ9F0MHE+Z52B8EIth78Q=
github.com/eknkc/amber v0.0.0-20171010120322-cdade1c07385/go.mod h1:0vRUJqYpeSZifjYj7uP3BG/gKcuzL9xWVV/Y+cK33KM=
github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc=
github.com/elazarl/goproxy v0.0.0-20221015165544-a0805db90819 h1:RIB4cRk+lBqKK3Oy0r2gRX4ui7tuhiZq2SuTtTCi0/0=
github.com/elimity-com/scim v0.0.0-20220121082953-15165b1a61c8 h1:6fUaAaX4Xe07LhVrHNmpfnlU41Nsto4skz4vhVqGwYk=
@ -819,6 +824,7 @@ github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbV
github.com/go-logr/stdr v1.2.0/go.mod h1:YkVgnZu1ZjjL7xTxrfm/LLZBfkhTqSR1ydtm6jTKKwI=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab/go.mod h1:/P9AEU963A2AYjv4d1V5eVL1CQbEJq6aCNHDDjibzu8=
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-openapi/analysis v0.0.0-20180825180245-b006789cd277/go.mod h1:k70tL6pCuVxPJOHXQ+wIac1FUrvNkHolPie/cLEU6hI=
@ -1012,6 +1018,7 @@ github.com/gofrs/uuid v4.2.0+incompatible h1:yyYWMnhkhrKwwr8gAOcOCYxOOscHgDS9yZg
github.com/gofrs/uuid v4.2.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM=
github.com/gogo/googleapis v1.2.0/go.mod h1:Njal3psf3qN6dwBtQfUmBZh2ybovJ0tlu3o/AC7HYjU=
github.com/gogo/googleapis v1.4.0/go.mod h1:5YRNX2z1oM5gXdAkurHa942MDgEJyk02w4OecKY87+c=
github.com/gogo/googleapis v1.4.1/go.mod h1:2lpHqI5OcWCtVElxXnPt+s8oJvMpySlOyM6xDCrzib4=
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
github.com/gogo/protobuf v1.2.2-0.20190723190241-65acae22fc9d/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o=
@ -1019,6 +1026,7 @@ github.com/gogo/protobuf v1.3.0/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXP
github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/gogo/status v1.1.0/go.mod h1:BFv9nrluPLmrS0EmGVvLaPNmRosr9KapBYd5/hpY1WM=
github.com/golang-jwt/jwt v3.2.1+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
@ -1317,6 +1325,7 @@ github.com/honeycombio/libhoney-go v1.15.8/go.mod h1:+tnL2etFnJmVx30yqmoUkVyQjp7
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/huandu/xstrings v1.3.2 h1:L18LIDzqlW6xN2rEkpdV8+oL/IXWJ1APd+vsdYy4Wdw=
github.com/huandu/xstrings v1.3.2/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
github.com/iancoleman/strcase v0.2.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w=
@ -1806,6 +1815,7 @@ github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi
github.com/pierrec/lz4/v4 v4.1.17 h1:kV4Ip+/hUBC+8T6+2EgburRtkE9ef4nbY3f4dFhGjMc=
github.com/pierrec/lz4/v4 v4.1.17/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4=
github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8=
github.com/pjbgf/sha1cd v0.3.0 h1:4D5XXmUUBUl/xQ6IjCkEAbqXskkq/4O7LmGn0AqMDs4=
github.com/pjbgf/sha1cd v0.3.0/go.mod h1:nZ1rrWOcGJ5uZgEEVL1VUM9iRQiZvWdbZjkKyFzPPsI=
github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4/go.mod h1:4OwLy04Bl9Ef3GJJCoec+30X3LQs/0/m4HFRt/2LUSA=
@ -1878,6 +1888,8 @@ github.com/pseudomuto/protoc-gen-doc v1.5.1 h1:Ah259kcrio7Ix1Rhb6u8FCaOkzf9qRBqX
github.com/pseudomuto/protoc-gen-doc v1.5.1/go.mod h1:XpMKYg6zkcpgfpCfQ8GcWBDRtRxOmMR5w7pz4Xo+dYM=
github.com/pseudomuto/protokit v0.2.0 h1:hlnBDcy3YEDXH7kc9gV+NLaN0cDzhDvD1s7Y6FZ8RpM=
github.com/pseudomuto/protokit v0.2.0/go.mod h1:2PdH30hxVHsup8KpBTOXTBeMVhJZVio3Q8ViKSAXT0Q=
github.com/qdrant/go-client v1.3.0 h1:LckV8C0TtMPDqWPd5g5sIa9zELQMelRlcZk9ANSZ2H8=
github.com/qdrant/go-client v1.3.0/go.mod h1:680gkxNAsVtre0Z8hAQmtPzJtz1xFAyCu2TUxULtnoE=
github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAiCZi0=
github.com/qustavo/sqlhooks/v2 v2.1.0/go.mod h1:aMREyKo7fOKTwiLuWPsaHRXEmtqG4yREztO0idF83AU=
github.com/rafaeljusto/redigomock/v3 v3.1.2 h1:B4Y0XJQiPjpwYmkH55aratKX1VfR+JRqzmDKyZbC99o=
@ -1933,6 +1945,7 @@ github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06/go.mod h1:+e
github.com/safchain/ethtool v0.0.0-20190326074333-42ed695e3de8/go.mod h1:Z0q5wiBQGYcxhMZ6gUqHn6pYNLypFAvaL3UvgZLR0U4=
github.com/safchain/ethtool v0.0.0-20210803160452-9aa261dae9b1/go.mod h1:Z0q5wiBQGYcxhMZ6gUqHn6pYNLypFAvaL3UvgZLR0U4=
github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
github.com/schollz/closestmatch v2.1.0+incompatible/go.mod h1:RtP1ddjLong6gTkbtmuhtR2uUrrJOpYzYRvbcPAid+g=
github.com/schollz/progressbar/v3 v3.8.5 h1:VcmmNRO+eFN3B0m5dta6FXYXY+MEJmXdWoIS+jjssQM=
github.com/schollz/progressbar/v3 v3.8.5/go.mod h1:ewO25kD7ZlaJFTvMeOItkOZa8kXu1UvFs379htE8HMQ=
github.com/scim2/filter-parser/v2 v2.2.0 h1:QGadEcsmypxg8gYChRSM2j1edLyE/2j72j+hdmI4BJM=
@ -2132,6 +2145,8 @@ github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtX
github.com/urfave/cli/v2 v2.3.0/go.mod h1:LJmUH05zAU44vOAcrfzZQKsZbVcdbOG8rtL3/XcUArI=
github.com/urfave/cli/v2 v2.23.7 h1:YHDQ46s3VghFHFf1DdF+Sh7H4RqhcM+t0TmZRJx4oJY=
github.com/urfave/cli/v2 v2.23.7/go.mod h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc=
github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/vektah/gqlparser v1.1.2/go.mod h1:1ycwN7Ij5njmMkPPAOaRFY4rET2Enx7IkVv3vaXspKw=
github.com/vektah/gqlparser/v2 v2.4.0/go.mod h1:flJWIR04IMQPGz+BXLrORkrARBxv/rtyIAFvd/MceW0=
github.com/vektah/gqlparser/v2 v2.4.5 h1:C02NsyEsL4TXJB7ndonqTfuQOL4XPIu0aAWugdmTgmc=
@ -2351,6 +2366,7 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.16.0 h1:iqjq9
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.16.0/go.mod h1:hGXzO5bhhSHZnKvrDaXB82Y9DRFour0Nz/KrBh7reWw=
go.opentelemetry.io/otel/exporters/prometheus v0.39.0 h1:whAaiHxOatgtKd+w0dOi//1KUxj3KoPINZdtDaDj3IA=
go.opentelemetry.io/otel/exporters/prometheus v0.39.0/go.mod h1:4jo5Q4CROlCpSPsXLhymi+LYrDXd2ObU5wbKayfZs7Y=
go.opentelemetry.io/otel/internal/metric v0.27.0/go.mod h1:n1CVxRqKqYZtqyTh9U/onvKapPGv7y/rpyOTI+LFNzw=
go.opentelemetry.io/otel/metric v0.20.0/go.mod h1:598I5tYlH1vzBjn+BTuhzTCSb/9debfNp6R3s7Pr1eU=
go.opentelemetry.io/otel/metric v1.16.0 h1:RbrpwVG1Hfv85LgnZ7+txXioPDoh6EdbZHo26Q3hqOo=
go.opentelemetry.io/otel/metric v1.16.0/go.mod h1:QE47cpOmkwipPiefDwo2wDzwJrlfxxNYodqc4xnGCo4=

View File

@ -0,0 +1,28 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
load("//dev:go_defs.bzl", "go_test")
# Library target for the embeddings vector DB package. It compiles the four
# listed Go sources and is visible to every package in this repository.
go_library(
name = "db",
srcs = [
"chunk_point.go",
"db.go",
"migrate.go",
"qdrant.go",
],
importpath = "github.com/sourcegraph/sourcegraph/internal/embeddings/db",
visibility = ["//:__subpackages__"],
deps = [
"//internal/api",
"//lib/errors",
"//lib/pointers",
"@com_github_google_uuid//:uuid",
"@com_github_qdrant_go_client//qdrant",
],
)
# Unit tests for the db package; embeds :db so the tests compile into the same
# package as the library sources.
go_test(
name = "db_test",
srcs = ["chunk_point_test.go"],
embed = [":db"],
deps = ["@com_github_stretchr_testify//require"],
)

View File

@ -0,0 +1,151 @@
package db
import (
"encoding/binary"
"hash/fnv"
"github.com/google/uuid"
qdrant "github.com/qdrant/go-client/qdrant"
"github.com/sourcegraph/sourcegraph/internal/api"
)
// ChunkResult is a point along with its search score.
type ChunkResult struct {
// Point is the matched chunk: its ID, payload, and vector as returned by the DB.
Point ChunkPoint
// Score is the score the search reported for Point.
Score float32
}
// FromQdrantResult overwrites c with the contents of a Qdrant scored point.
//
// It returns an error if the point's ID cannot be parsed as a UUID; in that
// case c is left unmodified.
func (c *ChunkResult) FromQdrantResult(res *qdrant.ScoredPoint) error {
	id, err := uuid.Parse(res.GetId().GetUuid())
	if err != nil {
		return err
	}

	point := ChunkPoint{
		ID:     id,
		Vector: res.GetVectors().GetVector().GetData(),
	}
	point.Payload.FromQdrantPayload(res.GetPayload())

	c.Point = point
	c.Score = res.GetScore()
	return nil
}
// NewChunkPoint builds a ChunkPoint from a payload and its embedding vector.
// The point ID is derived from the payload's repo ID, revision, file path,
// and line range via chunkUUID.
func NewChunkPoint(payload ChunkPayload, vector []float32) ChunkPoint {
	id := chunkUUID(payload.RepoID, payload.Revision, payload.FilePath, payload.StartLine, payload.EndLine)
	return ChunkPoint{ID: id, Payload: payload, Vector: vector}
}
// ChunkPoint is a single point in the vector DB: one embedded file chunk.
type ChunkPoint struct {
// ID identifies the point; NewChunkPoint derives it from the payload.
ID uuid.UUID
// Payload is the metadata stored alongside the vector.
Payload ChunkPayload
// Vector is the embedding for the chunk.
Vector []float32
}
// ToQdrantPoint converts c into the Qdrant wire representation: the ID is
// sent as a UUID string, the payload as a Qdrant value map, and the
// embedding as a single unnamed vector.
func (c *ChunkPoint) ToQdrantPoint() *qdrant.PointStruct {
	pointID := &qdrant.PointId{
		PointIdOptions: &qdrant.PointId_Uuid{Uuid: c.ID.String()},
	}
	vectors := &qdrant.Vectors{
		VectorsOptions: &qdrant.Vectors_Vector{
			Vector: &qdrant.Vector{Data: c.Vector},
		},
	}
	return &qdrant.PointStruct{
		Id:      pointID,
		Payload: c.Payload.ToQdrantPayload(),
		Vectors: vectors,
	}
}
// ChunkPoints is a list of ChunkPoint values that can be converted to Qdrant
// points in bulk.
type ChunkPoints []ChunkPoint

// ToQdrantPoints converts every point in ps to its Qdrant representation,
// preserving order. The result always has len(ps) entries.
func (ps ChunkPoints) ToQdrantPoints() []*qdrant.PointStruct {
	converted := make([]*qdrant.PointStruct, 0, len(ps))
	for idx := range ps {
		converted = append(converted, ps[idx].ToQdrantPoint())
	}
	return converted
}
// PayloadField is the name of a key in the payload map stored with each point.
// It is an alias for string, so values can be used directly as map keys and
// filter keys.
type PayloadField = string
// Keys used for each ChunkPayload field in the Qdrant payload map. The same
// keys are used when building filter conditions.
const (
fieldRepoID PayloadField = "repoID"
fieldRepoName PayloadField = "repoName"
fieldRevision PayloadField = "revision"
fieldFilePath PayloadField = "filePath"
fieldStartLine PayloadField = "startLine"
fieldEndLine PayloadField = "endLine"
fieldIsCode PayloadField = "isCode"
)
// ChunkPayload is a well-typed representation of the payload we store in the vector DB.
// Changes to the contents of this struct may require a migration of the data in the DB.
type ChunkPayload struct {
// RepoName and RepoID identify the repository the chunk belongs to.
RepoName api.RepoName
RepoID api.RepoID
// Revision is the commit the chunk is recorded against.
Revision api.CommitID
// FilePath is the path of the file the chunk was taken from.
FilePath string
// StartLine and EndLine delimit the chunk within the file.
// NOTE(review): inclusivity and 0- vs 1-based indexing are not established in this package — confirm with the producer.
StartLine, EndLine uint32
// IsCode distinguishes code chunks from text chunks; Search queries the two separately.
IsCode bool
}
// ToQdrantPayload encodes p as a Qdrant payload map keyed by the field*
// constants. Numeric fields are stored as integers, names/paths/revisions as
// strings, and IsCode as a boolean.
func (p *ChunkPayload) ToQdrantPayload() map[string]*qdrant.Value {
	intValue := func(n int64) *qdrant.Value {
		return &qdrant.Value{Kind: &qdrant.Value_IntegerValue{IntegerValue: n}}
	}
	stringValue := func(s string) *qdrant.Value {
		return &qdrant.Value{Kind: &qdrant.Value_StringValue{StringValue: s}}
	}
	boolValue := func(b bool) *qdrant.Value {
		return &qdrant.Value{Kind: &qdrant.Value_BoolValue{BoolValue: b}}
	}

	out := make(map[string]*qdrant.Value, 7)
	out[fieldRepoID] = intValue(int64(p.RepoID))
	out[fieldRepoName] = stringValue(string(p.RepoName))
	out[fieldRevision] = stringValue(string(p.Revision))
	out[fieldFilePath] = stringValue(p.FilePath)
	out[fieldStartLine] = intValue(int64(p.StartLine))
	out[fieldEndLine] = intValue(int64(p.EndLine))
	out[fieldIsCode] = boolValue(p.IsCode)
	return out
}
// FromQdrantPayload overwrites every field of p with the values found in a
// Qdrant payload map. Keys that are missing (or hold a different value kind)
// decode to the zero value of the corresponding field, because the generated
// getters are nil-safe.
func (p *ChunkPayload) FromQdrantPayload(payload map[string]*qdrant.Value) {
	text := func(key PayloadField) string { return payload[key].GetStringValue() }
	number := func(key PayloadField) int64 { return payload[key].GetIntegerValue() }

	var decoded ChunkPayload
	decoded.RepoName = api.RepoName(text(fieldRepoName))
	decoded.RepoID = api.RepoID(number(fieldRepoID))
	decoded.Revision = api.CommitID(text(fieldRevision))
	decoded.FilePath = text(fieldFilePath)
	decoded.StartLine = uint32(number(fieldStartLine))
	decoded.EndLine = uint32(number(fieldEndLine))
	decoded.IsCode = payload[fieldIsCode].GetBoolValue()
	*p = decoded
}
// chunkUUID generates a stable UUID for a file chunk. It is not strictly necessary to have a stable ID,
// but it does make it easier to reason about idempotent updates.
//
// The ID is the 128-bit FNV-1 hash of, in order: repoID (little-endian
// uint32), revision, filePath, startLine, and endLine (both little-endian
// uint32). Every one of these inputs must reach the hasher: two chunks of the
// same file that share an end line but start on different lines must not
// collide.
func chunkUUID(repoID api.RepoID, revision api.CommitID, filePath string, startLine, endLine uint32) uuid.UUID {
	hasher := fnv.New128()

	// writeUint32 feeds v into the hash. hash.Hash.Write is documented to
	// never return an error, so the result is intentionally not checked.
	var buf [4]byte
	writeUint32 := func(v uint32) {
		binary.LittleEndian.PutUint32(buf[:], v)
		hasher.Write(buf[:])
	}

	writeUint32(uint32(repoID))
	hasher.Write([]byte(revision))
	hasher.Write([]byte(filePath))
	// Each line number is written to the hasher before buf is reused for the
	// next one; otherwise startLine would be overwritten and never hashed.
	writeUint32(startLine)
	writeUint32(endLine)

	// A 128-bit FNV sum is exactly the 16 bytes a UUID holds.
	var u uuid.UUID
	copy(u[:], hasher.Sum(nil))
	return u
}

View File

@ -0,0 +1,26 @@
package db
import (
"testing"
"github.com/stretchr/testify/require"
)
// TestPayload checks that a ChunkPayload survives conversion to a Qdrant
// payload map and back unchanged.
func TestPayload(t *testing.T) {
	t.Run("roundtrip", func(t *testing.T) {
		// Every field is set to a non-zero value. A zero-valued field would
		// compare equal even if it were dropped or stored under the wrong key,
		// because missing keys decode to the zero value.
		pp := ChunkPayload{
			RepoName:  "a",
			RepoID:    2,
			Revision:  "c",
			FilePath:  "d",
			StartLine: 5,
			EndLine:   6,
			IsCode:    true,
		}

		qp := pp.ToQdrantPayload()
		var newPP ChunkPayload
		newPP.FromQdrantPayload(qp)
		require.Equal(t, pp, newPP)
	})
}

View File

@ -0,0 +1,29 @@
package db
import (
"context"
"fmt"
"strings"
"github.com/sourcegraph/sourcegraph/internal/api"
)
// VectorDB is the full vector database abstraction: it combines the read side
// (VectorSearcher) and the write side (VectorInserter).
type VectorDB interface {
VectorSearcher
VectorInserter
}
// VectorSearcher is the read-only side of a vector DB.
type VectorSearcher interface {
// Search returns the chunks matching the given search parameters, each
// paired with its score.
Search(context.Context, SearchParams) ([]ChunkResult, error)
}
// VectorInserter is the write side of a vector DB. The method descriptions
// below reflect what the Qdrant-backed implementation in this package does.
type VectorInserter interface {
// PrepareUpdate makes sure the storage for the given model exists before
// chunks are inserted; modelDims is the dimensionality of the model's vectors.
PrepareUpdate(ctx context.Context, modelID string, modelDims uint64) error
// HasIndex reports whether at least one chunk is stored for the given
// repo at the given revision.
HasIndex(ctx context.Context, modelID string, repoID api.RepoID, revision api.CommitID) (bool, error)
// InsertChunks upserts the given chunk points.
InsertChunks(context.Context, InsertParams) error
// FinalizeUpdate removes chunks of deleted files and moves the remaining
// chunks of the repo to the new revision.
FinalizeUpdate(context.Context, FinalizeUpdateParams) error
}
// CollectionName returns the name of the collection that holds chunks for
// the given model: the model ID with every "/" replaced by "." and prefixed
// with "repos.".
func CollectionName(modelID string) string {
	dotted := strings.ReplaceAll(modelID, "/", ".")
	name := fmt.Sprintf("repos.%s", dotted)
	return name
}

View File

@ -0,0 +1,106 @@
package db
import (
"context"
qdrant "github.com/qdrant/go-client/qdrant"
"github.com/sourcegraph/sourcegraph/lib/errors"
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
// ensureModelCollectionWithDefaultConfig makes sure a collection with the
// default configuration exists for modelID and that the model's collection
// name is an alias pointing at it.
//
// The backing collection is named "<CollectionName(modelID)>.default" so that
// a different configuration can later be swapped in by re-pointing the alias.
func ensureModelCollectionWithDefaultConfig(ctx context.Context, cc qdrant.CollectionsClient, modelID string, modelDims uint64) error {
	alias := CollectionName(modelID)
	backing := alias + ".default"

	if err := ensureCollection(ctx, cc, backing, defaultConfig(modelDims)); err != nil {
		return err
	}

	// Point the alias at the backing collection.
	pointAlias := &qdrant.AliasOperations{
		Action: &qdrant.AliasOperations_CreateAlias{
			CreateAlias: &qdrant.CreateAlias{
				CollectionName: backing,
				AliasName:      alias,
			},
		},
	}
	request := &qdrant.ChangeAliases{Actions: []*qdrant.AliasOperations{pointAlias}}
	if _, err := cc.UpdateAliases(ctx, request); err != nil {
		return errors.Wrap(err, "update aliases")
	}
	return nil
}
// ensureCollection creates the collection called name with the given config
// unless a collection with that exact name is already listed by the server.
// An existing collection is left as-is; its configuration is not compared
// against config.
func ensureCollection(ctx context.Context, cc qdrant.CollectionsClient, name string, config *qdrant.CollectionConfig) error {
	listing, err := cc.List(ctx, &qdrant.ListCollectionsRequest{})
	if err != nil {
		return err
	}

	for _, existing := range listing.GetCollections() {
		if existing.GetName() == name {
			// Nothing to do: the collection is already there.
			return nil
		}
	}

	// Create an empty collection from config. InitFromCollection is left nil,
	// so no data is copied from any other collection.
	params := config.Params
	request := &qdrant.CreateCollection{
		CollectionName:         name,
		HnswConfig:             config.HnswConfig,
		WalConfig:              config.WalConfig,
		OptimizersConfig:       config.OptimizerConfig,
		ShardNumber:            &params.ShardNumber,
		OnDiskPayload:          &params.OnDiskPayload,
		VectorsConfig:          params.VectorsConfig,
		ReplicationFactor:      params.ReplicationFactor,
		WriteConsistencyFactor: params.WriteConsistencyFactor,
		InitFromCollection:     nil,
		QuantizationConfig:     config.QuantizationConfig,
	}
	_, err = cc.Create(ctx, request)
	return err
}
// defaultConfig returns the collection configuration used for newly created
// model collections. dims is the size of the vectors the collection stores.
//
// TODO: loudly document that changing this will cause a rebuild of the vector indexes
func defaultConfig(dims uint64) *qdrant.CollectionConfig {
	vectorParams := &qdrant.VectorParams{
		Size:               dims,
		Distance:           qdrant.Distance_Cosine,
		HnswConfig:         nil,                // use collection default
		QuantizationConfig: nil,                // use collection default
		OnDisk:             pointers.Ptr(true), // explicitly enabled, not the collection default
	}

	collectionParams := &qdrant.CollectionParams{
		ShardNumber:   1,
		OnDiskPayload: true,
		VectorsConfig: &qdrant.VectorsConfig{
			Config: &qdrant.VectorsConfig_Params{Params: vectorParams},
		},
		ReplicationFactor:      nil, // default
		WriteConsistencyFactor: nil, // default
	}

	optimizers := &qdrant.OptimizersConfigDiff{
		IndexingThreshold: pointers.Ptr(uint64(0)), // disable indexing
	}

	// Scalar quantization is faster than product quantization, but doesn't
	// compress as well.
	scalar := &qdrant.ScalarQuantization{
		Type: qdrant.QuantizationType_Int8,
		// Truncate outliers for better compression
		Quantile:  pointers.Ptr(float32(0.98)),
		AlwaysRam: nil, // default false
	}

	return &qdrant.CollectionConfig{
		Params:          collectionParams,
		OptimizerConfig: optimizers,
		WalConfig:       nil, // default
		QuantizationConfig: &qdrant.QuantizationConfig{
			Quantization: &qdrant.QuantizationConfig_Scalar{Scalar: scalar},
		},
	}
}

View File

@ -0,0 +1,267 @@
package db
import (
"context"
qdrant "github.com/qdrant/go-client/qdrant"
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/lib/pointers"
)
// NewQdrantDB returns a VectorDB backed by Qdrant. pointsClient is used for
// point-level operations (search, scroll, upsert, delete, set payload) and
// collectionsClient for collection and alias management.
func NewQdrantDB(pointsClient qdrant.PointsClient, collectionsClient qdrant.CollectionsClient) VectorDB {
	db := new(qdrantDB)
	db.pointsClient = pointsClient
	db.collectionsClient = collectionsClient
	return db
}
// qdrantDB is the Qdrant-backed implementation of VectorDB.
type qdrantDB struct {
// pointsClient performs operations on points within a collection.
pointsClient qdrant.PointsClient
// collectionsClient manages collections and their aliases.
collectionsClient qdrant.CollectionsClient
}
// Compile-time check that *qdrantDB satisfies VectorDB.
var _ VectorDB = (*qdrantDB)(nil)
// SearchParams describes a similarity search.
type SearchParams struct {
// ModelID selects the collection to search (see CollectionName).
ModelID string
// RepoIDs restricts results to chunks from at least one of these repos.
RepoIDs []api.RepoID
// Query is the embedding vector to search with.
Query []float32
// CodeLimit is the maximum number of code chunks to return.
CodeLimit int
// TextLimit is the maximum number of text (non-code) chunks to return.
TextLimit int
}
// Search runs two similarity queries against the model's collection in a
// single batch request — one restricted to code chunks (limited to
// params.CodeLimit) and one to text chunks (limited to params.TextLimit) —
// and returns the code results followed by the text results.
//
// Both queries are filtered to chunks whose repo ID matches at least one of
// params.RepoIDs. The full payload is requested for every hit.
// NOTE(review): the request does not ask for vectors, so ChunkPoint.Vector in
// the results is presumably empty — confirm against the server default.
func (db *qdrantDB) Search(ctx context.Context, params SearchParams) ([]ChunkResult, error) {
	collection := CollectionName(params.ModelID)

	// buildQuery assembles the query for one chunk kind with its own limit.
	buildQuery := func(isCode bool, limit int) *qdrant.SearchPoints {
		filter := &qdrant.Filter{
			Should: repoIDsConditions(params.RepoIDs),
			Must:   []*qdrant.Condition{isCodeCondition(isCode)},
		}
		return &qdrant.SearchPoints{
			CollectionName: collection,
			Vector:         params.Query,
			WithPayload:    fullPayloadSelector,
			Filter:         filter,
			Limit:          uint64(limit),
		}
	}

	batch := &qdrant.SearchBatchPoints{
		CollectionName: collection,
		SearchPoints: []*qdrant.SearchPoints{
			buildQuery(true, params.CodeLimit),
			buildQuery(false, params.TextLimit),
		},
	}
	resp, err := db.pointsClient.SearchBatch(ctx, batch)
	if err != nil {
		return nil, err
	}

	hits := make([]ChunkResult, 0, params.CodeLimit+params.TextLimit)
	for _, queryResult := range resp.GetResult() {
		for _, scored := range queryResult.GetResult() {
			var hit ChunkResult
			if convErr := hit.FromQdrantResult(scored); convErr != nil {
				return nil, convErr
			}
			hits = append(hits, hit)
		}
	}
	return hits, nil
}
// PrepareUpdate makes sure the collection (and alias) for modelID exists with
// the default configuration for vectors of modelDims dimensions.
func (db *qdrantDB) PrepareUpdate(ctx context.Context, modelID string, modelDims uint64) error {
	collections := db.collectionsClient
	return ensureModelCollectionWithDefaultConfig(ctx, collections, modelID, modelDims)
}
// HasIndex reports whether the model's collection contains at least one chunk
// for repoID at the given revision. It scrolls with a limit of one point, so
// only existence is checked.
func (db *qdrantDB) HasIndex(ctx context.Context, modelID string, repoID api.RepoID, revision api.CommitID) (bool, error) {
	// Both conditions must hold: same repo and same revision.
	filter := &qdrant.Filter{
		Must: []*qdrant.Condition{
			repoIDCondition(repoID),
			revisionCondition(revision),
		},
	}
	request := &qdrant.ScrollPoints{
		CollectionName: CollectionName(modelID),
		Filter:         filter,
		Limit:          pointers.Ptr(uint32(1)),
	}

	page, err := db.pointsClient.Scroll(ctx, request)
	if err != nil {
		return false, err
	}
	found := len(page.GetResult()) > 0
	return found, nil
}
// InsertParams describes a batch of chunks to upsert.
type InsertParams struct {
// ModelID selects the collection to insert into (see CollectionName).
ModelID string
// ChunkPoints are the points to upsert.
ChunkPoints ChunkPoints
}
// InsertChunks upserts params.ChunkPoints into the model's collection and
// waits for the server to apply the write before returning.
func (db *qdrantDB) InsertChunks(ctx context.Context, params InsertParams) error {
	request := &qdrant.UpsertPoints{
		CollectionName: CollectionName(params.ModelID),
		// Wait to avoid overloading the server
		Wait:     pointers.Ptr(true),
		Points:   params.ChunkPoints.ToQdrantPoints(),
		Ordering: nil,
	}
	if _, err := db.pointsClient.Upsert(ctx, request); err != nil {
		return err
	}
	return nil
}
// FinalizeUpdateParams describes how to finish an incremental update of a
// repo's chunks.
type FinalizeUpdateParams struct {
// ModelID selects the collection to operate on (see CollectionName).
ModelID string
// RepoID is the repo whose chunks are being updated.
RepoID api.RepoID
// Revision is the new revision; chunks already at this revision are left untouched.
Revision api.CommitID
// FilesToRemove lists file paths whose older-revision chunks are deleted.
FilesToRemove []string
}
// FinalizeUpdate completes an update of a repo in two ordered steps: it
// deletes the older-revision chunks of the files listed for removal, then
// re-labels every remaining older-revision chunk of the repo with the new
// revision. The first failing step aborts the update.
//
// TODO: document that this is idempotent and why it's important
func (db *qdrantDB) FinalizeUpdate(ctx context.Context, params FinalizeUpdateParams) error {
	// Stale files have to go first; otherwise the revision bump below would
	// carry their chunks over to the new revision.
	if err := db.deleteFiles(ctx, params); err != nil {
		return err
	}

	// Whatever is left at an older revision is unchanged, so move it forward.
	return db.updateRevisions(ctx, params)
}
// deleteFiles deletes the chunks of params.RepoID that belong to one of
// params.FilesToRemove and are not already at params.Revision. It waits for
// the delete to be applied so that the following revision update does not see
// the removed chunks.
//
// When FilesToRemove is empty nothing is deleted and no request is sent.
func (db *qdrantDB) deleteFiles(ctx context.Context, params FinalizeUpdateParams) error {
	// An empty Should clause is indistinguishable from an absent one on the
	// wire, so the filter below would degrade to "every chunk of this repo
	// that is not at the newest revision" and wipe all unchanged chunks.
	if len(params.FilesToRemove) == 0 {
		return nil
	}

	// TODO: batch the deletes in case the file list is extremely large
	filePathConditions := make([]*qdrant.Condition, len(params.FilesToRemove))
	for i, path := range params.FilesToRemove {
		filePathConditions[i] = filePathCondition(path)
	}

	_, err := db.pointsClient.Delete(ctx, &qdrant.DeletePoints{
		CollectionName: CollectionName(params.ModelID),
		Wait:           pointers.Ptr(true), // wait until deleted before sending update
		Ordering:       &qdrant.WriteOrdering{Type: qdrant.WriteOrderingType_Strong},
		Points: &qdrant.PointsSelector{
			PointsSelectorOneOf: &qdrant.PointsSelector_Filter{
				Filter: &qdrant.Filter{
					// Only chunks for this repo
					Must: []*qdrant.Condition{repoIDCondition(params.RepoID)},
					// No chunks that are from the newest revision
					MustNot: []*qdrant.Condition{revisionCondition(params.Revision)},
					// Chunks that match at least one of the "to remove" filenames
					Should: filePathConditions,
				},
			},
		},
	})
	return err
}
// updateRevisions sets the revision payload field to params.Revision on every
// chunk of params.RepoID that is not already at that revision. It waits for
// the write to be applied before returning.
func (db *qdrantDB) updateRevisions(ctx context.Context, params FinalizeUpdateParams) error {
	newRevision := &qdrant.Value{
		Kind: &qdrant.Value_StringValue{StringValue: string(params.Revision)},
	}

	selector := &qdrant.PointsSelector{
		PointsSelectorOneOf: &qdrant.PointsSelector_Filter{
			Filter: &qdrant.Filter{
				// Only chunks in this repo
				Must: []*qdrant.Condition{repoIDCondition(params.RepoID)},
				// Only chunks that are not already marked as part of this revision
				MustNot: []*qdrant.Condition{revisionCondition(params.Revision)},
			},
		},
	}

	request := &qdrant.SetPayloadPoints{
		CollectionName: CollectionName(params.ModelID),
		Wait:           pointers.Ptr(true), // wait until the payload update has been applied
		Ordering:       &qdrant.WriteOrdering{Type: qdrant.WriteOrderingType_Strong},
		Payload:        map[string]*qdrant.Value{fieldRevision: newRevision},
		PointsSelector: selector,
	}
	_, err := db.pointsClient.SetPayload(ctx, request)
	return err
}
// filePathCondition returns a filter condition matching points whose
// file-path payload field equals path exactly (keyword match).
func filePathCondition(path string) *qdrant.Condition {
	match := &qdrant.Match{
		MatchValue: &qdrant.Match_Keyword{Keyword: path},
	}
	field := &qdrant.FieldCondition{Key: fieldFilePath, Match: match}
	return &qdrant.Condition{
		ConditionOneOf: &qdrant.Condition_Field{Field: field},
	}
}
// revisionCondition returns a filter condition matching points whose revision
// payload field equals revision exactly (keyword match).
func revisionCondition(revision api.CommitID) *qdrant.Condition {
	match := &qdrant.Match{
		MatchValue: &qdrant.Match_Keyword{Keyword: string(revision)},
	}
	field := &qdrant.FieldCondition{Key: fieldRevision, Match: match}
	return &qdrant.Condition{
		ConditionOneOf: &qdrant.Condition_Field{Field: field},
	}
}
// isCodeCondition returns a filter condition matching points whose isCode
// payload field equals the given boolean.
func isCodeCondition(isCode bool) *qdrant.Condition {
	match := &qdrant.Match{
		MatchValue: &qdrant.Match_Boolean{Boolean: isCode},
	}
	field := &qdrant.FieldCondition{Key: fieldIsCode, Match: match}
	return &qdrant.Condition{
		ConditionOneOf: &qdrant.Condition_Field{Field: field},
	}
}
// repoIDsConditions returns one repo-ID condition per entry of ids, in the
// same order. The result is an empty, non-nil slice when ids is empty.
func repoIDsConditions(ids []api.RepoID) []*qdrant.Condition {
	out := make([]*qdrant.Condition, 0, len(ids))
	for idx := range ids {
		out = append(out, repoIDCondition(ids[idx]))
	}
	return out
}
// repoIDCondition returns a filter condition matching points whose repo-ID
// payload field equals repoID (integer match).
func repoIDCondition(repoID api.RepoID) *qdrant.Condition {
	match := &qdrant.Match{
		MatchValue: &qdrant.Match_Integer{Integer: int64(repoID)},
	}
	field := &qdrant.FieldCondition{Key: fieldRepoID, Match: match}
	return &qdrant.Condition{
		ConditionOneOf: &qdrant.Condition_Field{Field: field},
	}
}
// fullPayloadSelector asks Qdrant to return the full payload with each result.
// It is a shared package-level value; Search passes it unmodified in its requests.
var fullPayloadSelector = &qdrant.WithPayloadSelector{
SelectorOptions: &qdrant.WithPayloadSelector_Enable{
Enable: true,
},
}