In [2]:
import openai
from openai import OpenAI
import os

# SECURITY(review): never hardcode an API key in a notebook — outputs and
# checked-in .ipynb files leak it. Read it from the environment instead;
# the empty-string fallback preserves the original (placeholder) behavior.
MY_API_KEY = os.environ.get("OPENAI_API_KEY", "")
MY_API_URL = "https://api.zhizengzeng.com/v1"
In [3]:
# Build the API client pointing at the configured endpoint.
client = OpenAI(
    base_url=MY_API_URL,
    api_key=MY_API_KEY,
)

# Embedding configuration.
embedding_model = "text-embedding-3-small"  # text-embedding model to call
embedding_encoding = "cl100k_base"          # tokenizer encoding matching that model
max_tokens = 8000                           # maximum token count allowed per text chunk
In [11]:
# Quick smoke test: embed one sentence and inspect the vector.
resp = client.embeddings.create(
    model=embedding_model,
    input="I have a dream",
)
vector = resp.data[0].embedding
print(len(vector))       # dimensionality of the embedding (1536 for this model)
print(vector[:10])       # first few components, just to eyeball the values
1536 [0.006279389839619398, -0.032659903168678284, -0.05116906762123108, 0.009919666685163975, -0.014348847791552544, -0.02665998972952366, 0.02361758053302765, 0.0636783167719841, -0.029207121580839157, -0.036933425813913345]
In [85]:
import pandas as pd

# Load the knowledge base: each line of the text file becomes one record,
# named "text" directly at read time (no header row in the file).
df = pd.read_csv("train.txt", header=None, names=["text"])
print(df)
text 0 LGX is a man. 1 LGX was born on the Earth. 2 LGX is from China.
In [86]:
import tiktoken

# Tokenizer for the configured encoding scheme.
encoding = tiktoken.get_encoding(embedding_encoding)

# Count tokens per row and store the counts in a new "n_tokens" column.
df["n_tokens"] = [len(encoding.encode(text)) for text in df["text"]]

# Keep only rows within the token budget, limited to the last `top_n` rows.
top_n = 100
df = df[df["n_tokens"] <= max_tokens].tail(top_n)

# Display the resulting DataFrame.
df
Out[86]:
text | n_tokens | |
---|---|---|
0 | LGX is a man. | 6 |
1 | LGX was born on the Earth. | 8 |
2 | LGX is from China. | 6 |
In [123]:
def get_embedding(text, model=embedding_model):
    """Call the embeddings API and return the vector for `text`.

    Newlines are flattened to spaces first, which is the recommended
    pre-processing for embedding inputs.
    """
    cleaned = text.replace("\n", " ")
    result = client.embeddings.create(input=[cleaned], model=model)
    return result.data[0].embedding

# Embed every row of the knowledge base. `model` defaults to
# `embedding_model`, so it does not need to be passed explicitly.
df['embedding'] = df["text"].apply(get_embedding)

# Display the DataFrame with its new "embedding" column.
df
Out[123]:
text | n_tokens | embedding | |
---|---|---|---|
0 | LGX is a man. | 6 | [0.013989169150590897, 0.009273948147892952, -... |
1 | LGX was born on the Earth. | 8 | [0.0102503951638937, 0.026698702946305275, 0.0... |
2 | LGX is from China. | 6 | [-0.013337323442101479, -0.011200105771422386,... |
In [124]:
from typing import List, Tuple
from scipy import spatial
import pandas as pd

# Rank knowledge-base sentences by similarity to a query.
def strings_ranked_by_relatedness(
    query: str,                                                   # query string
    df: pd.DataFrame,                                             # DataFrame with "text" and "embedding" columns
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),  # cosine similarity
    top_n: int = 100                                              # number of top results to return
) -> Tuple[List[str], List[float]]:
    """Return strings and their relatednesses, sorted from most to least related.

    Args:
        query: Text to compare against the knowledge base.
        df: DataFrame whose rows carry "text" and a precomputed "embedding".
        relatedness_fn: Similarity function over two embedding vectors
            (defaults to cosine similarity).
        top_n: Maximum number of results to return.

    Returns:
        A pair of lists: (strings, relatednesses), both ordered by descending
        relatedness. Returns ([], []) when `df` is empty.
    """
    # Embed the query via the shared helper (also normalizes newlines),
    # instead of duplicating the raw API call here.
    query_embedding = get_embedding(query)
    # Score every knowledge-base row against the query embedding.
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for _, row in df.iterrows()
    ]
    # BUGFIX: the original raised "not enough values to unpack" on an empty
    # DataFrame, because zip(*[]) yields nothing to destructure.
    if not strings_and_relatednesses:
        return [], []
    # Sort by relatedness, highest first.
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    # Split into parallel lists and truncate to the top_n results.
    strings, relatednesses = zip(*strings_and_relatednesses)
    return list(strings[:top_n]), list(relatednesses[:top_n])
In [125]:
# Demo: a query that IS related to the knowledge base.
# (The f-string uses `{relatedness=}`, so the loop variable name is part of
# the printed output and must stay `relatedness`.)
ranked = strings_ranked_by_relatedness("is LGX a man or a woman?", df, top_n=5)
for string, relatedness in zip(*ranked):
    print(f"{relatedness=:.3f}")
    display(string)
relatedness=0.763
'LGX is a man.'
relatedness=0.557
'LGX was born on the Earth.'
relatedness=0.551
'LGX is from China.'
In [126]:
# Demo: a query that is UNRELATED to the knowledge base — scores should be low.
# ("cholocates" [sic] is kept as-is; the recorded outputs below correspond to
# this exact query string.)
ranked = strings_ranked_by_relatedness("a box of cholocates", df, top_n=5)
for string, relatedness in zip(*ranked):
    print(f"{relatedness=:.3f}")
    display(string)
relatedness=0.168
'LGX is a man.'
relatedness=0.162
'LGX is from China.'
relatedness=0.092
'LGX was born on the Earth.'
In [133]:
# Chat model used for answering questions.
GPT_MODEL = "gpt-3.5-turbo"

def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens `text` occupies under `model`'s tokenizer.

    Uses tiktoken to look up the encoding registered for the given model.
    """
    return len(tiktoken.encoding_for_model(model).encode(text))
# Build the GPT prompt, including relevant texts pulled from the DataFrame.
def query_message(
    query: str,        # user query
    df: pd.DataFrame,  # DataFrame containing relevant texts and embeddings
    model: str,        # GPT model name (used only for token counting here)
    token_budget: int  # maximum number of tokens the message may use
) -> str:
    """Return a message for GPT with relevant source texts pulled from a dataframe.

    Args:
        query: The user's question.
        df: DataFrame whose rows carry "text" and "embedding" columns.
        model: Model name passed to the tokenizer for budget accounting.
        token_budget: Hard cap on the assembled message's token count.

    Returns:
        The prompt: an instruction, as many relevant sections as fit within
        `token_budget`, and the question appended at the end.
    """
    # Rank knowledge-base strings by relatedness to the query.
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    # Instruction telling GPT how to use the retrieved context.
    introduction = 'Use the articles about LGX to answer the subsequent question. If the answer cannot be found, write "I could not find an answer."'
    # The question always goes at the very end of the message.
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        # BUGFIX: the original f-string began with '\n\db ...' — "\d" is an
        # invalid escape sequence, so a literal backslash leaked into the
        # prompt (and newer Pythons warn about it). Use a clean section header.
        next_article = f'\n\nDB information section:\n"""\n{string}\n"""'
        # Stop before the budget is exceeded; the question must still fit.
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        else:
            message += next_article
    # Full prompt: introduction + fitted sections + question.
    return message + question
# Answer a query with GPT, grounding the prompt in the DataFrame's texts.
def ask(
    query: str,                    # user query
    df: pd.DataFrame = df,         # DataFrame containing texts and embeddings
    model: str = GPT_MODEL,        # GPT model to use
    token_budget: int = 4096 - 500,  # token cap, reserving 500 tokens for the reply
    print_message: bool = False,   # whether to print the generated prompt
) -> str:
    """Answer `query` using GPT and a dataframe of relevant texts and embeddings.

    Retrieval-augmented flow: build a context-stuffed prompt via
    `query_message`, send it to the chat API, and return the model's text.
    """
    # Assemble the retrieval-augmented prompt.
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)
    # System message pins the assistant to the LGX domain; the user turn
    # carries the full context + question.
    chat_messages = [
        {"role": "system", "content": "You only answer questions about LGX."},
        {"role": "user", "content": prompt},
    ]
    # temperature=0 keeps the answers as deterministic as the API allows.
    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=0,
    )
    return completion.choices[0].message.content
In [132]:
# Exercise the pipeline on a batch of questions (same calls, same order).
for question in (
    'What planet is LGX on?',
    'Is LGX a man or woman?',
    'Is LGX a male or female?',
    'Which country is LGX from?',
    'Who is LGX?',
    'Who are you?',
):
    print(ask(question))
LGX is on Earth. LGX is a man. LGX is a male. LGX is from China. LGX is a man from China who was born on Earth. I could not find an answer.
In [135]:
# Mixed-language questions: retrieval works regardless of query language.
for question in (
    'Tell me something about LGX.',
    '说说LGX',
    '你哪位?',
):
    print(ask(question))
LGX is a man from China. LGX 是一个男人,来自中国,出生在地球上。 LGX。
In [137]:
# Off-topic questions: the system prompt should make GPT decline these.
for question in (
    'Who are you?',
    'are you ok?',
    '你好',
):
    print(ask(question))
I could not find an answer. I could not find an answer. 你好! LGX is a man from China who was born on Earth.