@@ -40,7 +40,7 @@ import os
4040```
4141
4242``` python
43- _ = load_dotenv(find_dotenv())
43+ _ = load_dotenv(find_dotenv(), override = True )
4444service_url = os.environ[' TIMESCALE_SERVICE_URL' ]
4545```
4646
@@ -97,24 +97,24 @@ Now you can query for similar items:
9797await vec.search([1.0 , 9.0 ])
9898```
9999
100- [<Record id=UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864 ') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>,
101- <Record id=UUID('2cdb8cbd-5dd7-4555-926a-5efafb4b1cf0 ') metadata={'animal': 'fox'} contents='the brown fox' embedding=array([1. , 1.3], dtype=float32) distance=0.14489260377438218>]
100+ [<Record id=UUID('d10dc66f-92d5-4296-a702-1690860bbe55 ') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>,
101+ <Record id=UUID('06153343-9085-4844-ad7a-b5cbed912053 ') metadata={'animal': 'fox'} contents='the brown fox' embedding=array([1. , 1.3], dtype=float32) distance=0.14489260377438218>]
102102
103103You can specify the number of records to return.
104104
105105``` python
106106await vec.search([1.0 , 9.0 ], limit = 1 )
107107```
108108
109- [<Record id=UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864 ') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>]
109+ [<Record id=UUID('d10dc66f-92d5-4296-a702-1690860bbe55 ') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>]
110110
111111You can also specify a filter on the metadata as a simple dictionary
112112
113113``` python
114114await vec.search([1.0 , 9.0 ], limit = 1 , filter = {" action" : " jump" })
115115```
116116
117- [<Record id=UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864 ') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>]
117+ [<Record id=UUID('d10dc66f-92d5-4296-a702-1690860bbe55 ') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>]
118118
119119You can also specify a list of filter dictionaries, where an item is
120120returned if it matches any dict
@@ -123,8 +123,8 @@ returned if it matches any dict
123123await vec.search([1.0 , 9.0 ], limit = 2 , filter = [{" action" : " jump" }, {" animal" : " fox" }])
124124```
125125
126- [<Record id=UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864 ') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>,
127- <Record id=UUID('2cdb8cbd-5dd7-4555-926a-5efafb4b1cf0 ') metadata={'animal': 'fox'} contents='the brown fox' embedding=array([1. , 1.3], dtype=float32) distance=0.14489260377438218>]
126+ [<Record id=UUID('d10dc66f-92d5-4296-a702-1690860bbe55 ') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>,
127+ <Record id=UUID('06153343-9085-4844-ad7a-b5cbed912053 ') metadata={'animal': 'fox'} contents='the brown fox' embedding=array([1. , 1.3], dtype=float32) distance=0.14489260377438218>]
128128
129129You can access the fields as follows
130130
@@ -133,7 +133,7 @@ records = await vec.search([1.0, 9.0], limit=1, filter={"action": "jump"})
133133records[0 ][client.SEARCH_RESULT_ID_IDX ]
134134```
135135
136- UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864 ')
136+ UUID('d10dc66f-92d5-4296-a702-1690860bbe55 ')
137137
138138``` python
139139records[0 ][client.SEARCH_RESULT_METADATA_IDX ]
@@ -293,6 +293,134 @@ search call:
293293rec = await vec.search([1.0 , 2.0 ], limit = 4 , uuid_time_filter = client.UUIDTimeRange(specific_datetime- timedelta(days = 7 ), specific_datetime+ timedelta(days = 7 )))
294294```
295295
296+ # PgVectorize
297+
298+ PgVectorize enables you to create vector embeddings from any data that
299+ you already have stored in Postgres. Simply attach PgVectorize to any
300+ Postgres table, and it will automatically sync that table’s data with a
301+ set of embeddings stored in Timescale Vector. For example, let’s say you
302+ have a blog table defined in the following way:
303+
304+ ``` python
305+ import psycopg2
306+ from langchain.docstore.document import Document
307+ from langchain.text_splitter import CharacterTextSplitter
308+ from timescale_vector import client, pgvectorizer
309+ from langchain.embeddings.openai import OpenAIEmbeddings
310+ from langchain.vectorstores.timescalevector import TimescaleVector
311+ from datetime import timedelta
312+ ```
313+
314+ ``` python
315+ with psycopg2.connect(service_url) as conn:
316+ with conn.cursor() as cursor:
317+ cursor.execute('''
318+ CREATE TABLE IF NOT EXISTS blog (
319+ id SERIAL PRIMARY KEY NOT NULL,
320+ title TEXT NOT NULL,
321+ author TEXT NOT NULL,
322+ contents TEXT NOT NULL,
323+ category TEXT NOT NULL,
324+ published_time TIMESTAMPTZ NULL --NULL if not yet published
325+ );
326+ ''' )
327+ ```
328+
329+ You can insert some data as follows:
330+
331+ ``` python
332+ with psycopg2.connect(service_url) as conn:
333+ with conn.cursor() as cursor:
334+ cursor.execute('''
335+ INSERT INTO blog (title, author, contents, category, published_time) VALUES ('First Post', 'Matvey Arye', 'some super interesting content about cats.', 'AI', '2021-01-01');
336+ ''' )
337+ ```
338+
339+ Now, say you want to embed these blogs in Timescale Vector. First you
340+ need to define an ` embed_and_write ` function, that takes a set of blog
341+ posts, creates the embeddings, and writes them into TimescaleVector. For
342+ example, if using LangChain, it could look something like the following.
343+
344+ ``` python
345+ def get_document (blog ):
346+ text_splitter = CharacterTextSplitter(
347+ chunk_size = 1000 ,
348+ chunk_overlap = 200 ,
349+ )
350+ docs = []
351+ for chunk in text_splitter.split_text(blog[' contents' ]):
352+ content = f " Author { blog[' author' ]} , title: { blog[' title' ]} , contents: { chunk} "
353+ metadata = {
354+ " id" : str (client.uuid_from_time(blog[' published_time' ])),
355+ " blog_id" : blog[' id' ],
356+ " author" : blog[' author' ],
357+ " category" : blog[' category' ],
358+ " published_time" : blog[' published_time' ].isoformat(),
359+ }
360+ docs.append(Document(page_content = content, metadata = metadata))
361+ return docs
362+
363+ def embed_and_write (blog_instances , vectorizer ):
364+ embedding = OpenAIEmbeddings()
365+ vector_store = TimescaleVector(
366+ collection_name = " blog_embedding" ,
367+ service_url = service_url,
368+ embedding = embedding,
369+ time_partition_interval = timedelta(days = 30 ),
370+ )
371+
372+ # delete old embeddings for all ids in the work queue. locked_id is a special column that is set to the primary key of the table being
373+ # embedded. For items that are deleted, it is the only key that is set.
374+ metadata_for_delete = [{" blog_id" : blog[' locked_id' ]} for blog in blog_instances]
375+ vector_store.delete_by_metadata(metadata_for_delete)
376+
377+ documents = []
378+ for blog in blog_instances:
379+ # skip blogs that are not published yet, or are deleted (in which case it will be NULL)
380+ if blog[' published_time' ] != None :
381+ documents.extend(get_document(blog))
382+
383+ if len (documents) == 0 :
384+ return
385+
386+ texts = [d.page_content for d in documents]
387+ metadatas = [d.metadata for d in documents]
388+ ids = [d.metadata[" id" ] for d in documents]
389+ vector_store.add_texts(texts, metadatas, ids)
390+ ```
391+
392+ Then, all you have to do is run the following code in a scheduled job
393+ (cron job, lambda job, etc):
394+
395+ ``` python
396+ vectorizer = pgvectorizer.Vectorize(service_url, ' blog' )
397+ while vectorizer.process(embed_and_write) > 0 :
398+ pass
399+ ```
400+
401+ Every time that job runs it will sync the table with your embeddings. It
402+ will sync all inserts, updates, and deletes to an embeddings table called
403+ ` blog_embedding ` .
404+
405+ Now, you can simply search the embeddings as follows (again, using
406+ LangChain in the examples):
407+
408+ ``` python
409+ embedding = OpenAIEmbeddings()
410+ vector_store = TimescaleVector(
411+ collection_name = " blog_embedding" ,
412+ service_url = service_url,
413+ embedding = embedding,
414+ time_partition_interval = timedelta(days = 30 ),
415+ )
416+
417+ res = vector_store.similarity_search_with_score(" Blogs about cats" )
418+ res
419+ ```
420+
421+ [(Document(page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.', metadata={'id': '4a784000-4bc4-11eb-9140-78a539e57b40', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00+00:00'}),
422+ 0.12605134378941762)]
423+
296424## Development
297425
298426This project is developed with [ nbdev] ( https://nbdev.fast.ai/ ) . Please
0 commit comments