import logging
import os
from io import BytesIO
from urllib.parse import quote_plus

import pymongo
import tos

# Configure the log format
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# MongoDB connection settings. The URI can be overridden through the
# environment; the default is the local instance.
mongo_uri = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
db_name = "spider"
collection_name = "analyzed_books"

# TOS connection settings.
# NOTE: the access key and secret key used to be hard-coded in this file.
# They are now read from the environment; the previously committed keys
# should be rotated because they remain in version history.
ak = os.environ.get("TOS_ACCESS_KEY")
sk = os.environ.get("TOS_SECRET_KEY")
endpoint = "tos-cn-beijing.volces.com"
region = "cn-beijing"
bucket_name = "livein-origin-data"


def build_object_key(book_id: object) -> str:
    """Return the TOS object key under which a book's OCR text is stored."""
    return f"books/{book_id}.md"


def build_tos_url(object_key: str) -> str:
    """Return the URL written back to MongoDB for *object_key*."""
    return f"https://{bucket_name}.{endpoint}/{object_key}"


def migrate_document(doc: dict, tos_client, collection) -> bool:
    """Upload one document's OCR text to TOS and store the URL in MongoDB.

    Args:
        doc: A document from the ``analyzed_books`` collection.
        tos_client: Object exposing ``put_object(bucket, key, content=...)``.
        collection: Object exposing ``update_one(filter, update)``.

    Returns:
        True when the text was uploaded and the document updated; False when
        the document was skipped or the upload/update failed. Failures are
        logged and never raised, so one bad document cannot abort the run.
    """
    book_id = doc['_id']
    ocr_content = doc.get('ocr_content')
    object_key = build_object_key(book_id)
    tos_url = build_tos_url(object_key)

    # On a re-run the field already holds the URL. Uploading it again would
    # overwrite the real OCR text in TOS with the URL string.
    if ocr_content == tos_url:
        logging.info("Skipping %s: ocr_content already points to TOS", book_id)
        return False

    if not isinstance(ocr_content, str):
        logging.error("更新失败: document %s has no text ocr_content", book_id)
        return False

    try:
        # Wrap the encoded text in a file-like object for the upload.
        content = BytesIO(ocr_content.encode('utf-8'))
        tos_client.put_object(bucket_name, object_key, content=content)
        # Replace the inline text with the TOS link only after the upload
        # has succeeded.
        collection.update_one({'_id': book_id}, {'$set': {'ocr_content': tos_url}})
    except Exception as e:
        # Best-effort per document: log with traceback and carry on.
        logging.exception("更新失败: %s", e)
        return False

    logging.info(f"更新后的数据: {tos_url}\n")
    return True


def main() -> None:
    """Move the OCR text of every ``analyzed_books`` document into TOS."""
    if not ak or not sk:
        raise SystemExit(
            "TOS_ACCESS_KEY and TOS_SECRET_KEY must be set in the environment"
        )

    # One TOS client for the whole run, under its own name so it no longer
    # rebinds the MongoDB client.
    tos_client = tos.TosClientV2(ak, sk, endpoint, region)

    with pymongo.MongoClient(mongo_uri) as mongo_client:
        collection = mongo_client[db_name][collection_name]

        # Materialize the result set first so the updates below cannot
        # interfere with an open cursor.
        results = list(collection.find())
        print(f'analyzed_books需要修改的数据数量:{len(results)}')

        for doc in results:
            print(doc)
            migrate_document(doc, tos_client, collection)


if __name__ == "__main__":
    main()