插入数据后的效果:
代码如下:
"""Minimal Milvus / Zilliz Cloud demo: create a collection, insert, index, search.

The embeddings used here are random placeholders; they only demonstrate the
API flow and carry no semantic meaning.
"""
import configparser

import numpy as np
from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
)

# Single source of truth for the vector size: the schema, the inserted vectors
# and the query vector must all agree on it.
EMBEDDING_DIM = 128
COLLECTION_NAME = "sentence_collection"


def create_collection() -> Collection:
    """Define the sentence schema and return the collection handle.

    The schema has three fields: an auto-generated INT64 primary key, the raw
    sentence text (VARCHAR, at most 512 in length) and a float-vector embedding.

    Returns:
        The ``Collection`` object bound to ``COLLECTION_NAME``.
    """
    # Define the schema
    fields = [
        FieldSchema(name="sentence_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="sentence", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM),
    ]
    schema = CollectionSchema(fields, description="Sentence collection")

    # Create the collection
    # NOTE(review): presumably this reuses an existing collection of the same
    # name, so re-running the script would append three more rows - confirm.
    collection = Collection(name=COLLECTION_NAME, schema=schema)
    return collection


def insert_data(collection: Collection) -> None:
    """Insert three sample sentences with random embeddings.

    Args:
        collection: Target collection, as returned by ``create_collection``.
    """
    sentences = [
        "这是第一句。",
        "这是第二句。",
        "这是第三句。",
    ]
    # Random placeholder vectors, one per sentence.
    embeddings = np.random.rand(len(sentences), EMBEDDING_DIM).tolist()

    # Column-based payload. The primary key is omitted because the schema
    # declares it with auto_id=True.
    entities = [
        sentences,
        embeddings,
    ]
    insert_result = collection.insert(entities)
    print(f"Inserted {len(insert_result.primary_keys)} records into collection.")


def create_index(collection: Collection) -> None:
    """Build an IVF_FLAT index (L2 metric, nlist=128) on the embedding field.

    Args:
        collection: Collection whose ``embedding`` field should be indexed.
    """
    index_params = {
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128},
        "metric_type": "L2",
    }
    collection.create_index(field_name="embedding", index_params=index_params)
    print("Index created.")


def search_data(collection: Collection, query_sentence: str) -> None:
    """Run a top-3 vector search and print each hit.

    Args:
        collection: A collection that has been indexed and loaded.
        query_sentence: Currently unused. The query vector is random; it is
            not derived from this text. Replace the random vector with a real
            embedding of ``query_sentence`` for meaningful results.
    """
    # Random placeholder standing in for the embedding of query_sentence.
    query_embedding = np.random.rand(1, EMBEDDING_DIM).tolist()

    # The metric must match the one the index was built with.
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    results = collection.search(
        data=query_embedding,
        anns_field="embedding",
        param=search_params,
        limit=3,
        expr=None,
        output_fields=["sentence"],
    )

    # One result list per query vector; each list holds the nearest hits.
    for hits in results:
        for hit in hits:
            print(f"Match found: {hit.id} with distance: {hit.distance}, sentence: {hit.entity.get('sentence')}")


def main() -> None:
    """Connect using ``config.ini`` and run the full create/insert/index/search flow.

    Raises:
        FileNotFoundError: If ``config.ini`` cannot be read from the current
            working directory.
    """
    # Connect to Milvus
    cfp = configparser.RawConfigParser()
    # read() silently skips missing files; fail early with a clear message
    # instead of a later NoSectionError.
    if not cfp.read('config.ini', encoding='utf-8'):
        raise FileNotFoundError("config.ini not found or not readable")
    milvus_uri = cfp.get('example', 'uri')
    token = cfp.get('example', 'token')

    connections.connect("default", uri=milvus_uri, token=token)
    print(f"Connecting to DB: {milvus_uri}")

    # Create collection
    collection = create_collection()

    # Insert data
    insert_data(collection)

    # Create index
    create_index(collection)

    # Load the collection into memory
    collection.load()

    # Search data
    search_data(collection, "这是一个查询句子。")


if __name__ == '__main__':
    main()
运行效果:
python hello_zilliz_vectordb.py
Connecting to DB: https://in03-ca69f49bb65709f.api.gcp-us-west1.zillizcloud.com
Inserted 3 records into collection.
Index created.
Match found: 450140263656791260 with distance: 19.557846069335938, sentence: 这是第二句。
Match found: 450140263656791261 with distance: 20.327802658081055, sentence: 这是第三句。
Match found: 450140263656791259 with distance: 20.40052032470703, sentence: 这是第一句。
注意事项:
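- 向量转换:上面的代码使用了随机向量来模拟句子向量(查询向量同样是随机生成的,`query_sentence` 参数并未真正参与检索)。在实际应用中,您需要使用 NLP 模型(例如中文 BERT)来将中文句子转换为向量,并且插入和查询必须使用同一个模型;同时 schema 中 `embedding` 字段的 `dim`(此处为 128)必须与模型输出的向量维度一致。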
- 字符编码:确保在读取和处理中文文本时使用正确的字符编码(通常是 UTF-8)。