In [2]:
import openai
from openai import OpenAI
import os

# SECURITY(review): never hardcode an API key in a notebook — outputs and
# checked-in .ipynb files leak it. Read it from the environment instead;
# the empty-string fallback preserves the original (placeholder) behavior.
MY_API_KEY = os.environ.get("OPENAI_API_KEY", "")
MY_API_URL = "https://api.zhizengzeng.com/v1"
In [3]:
# Build the API client pointing at the configured endpoint.
client = OpenAI(
    base_url=MY_API_URL,
    api_key=MY_API_KEY,
)

# Embedding configuration.
embedding_model = "text-embedding-3-small"  # text-embedding model to call
embedding_encoding = "cl100k_base"          # tokenizer encoding matching that model
max_tokens = 8000                           # maximum token count allowed per text chunk
In [11]:
# Quick smoke test: embed one sentence and inspect the vector.
resp = client.embeddings.create(
    model=embedding_model,
    input="I have a dream",
)
vector = resp.data[0].embedding
print(len(vector))       # dimensionality of the embedding (1536 for this model)
print(vector[:10])       # first few components, just to eyeball the values
1536 [0.006279389839619398, -0.032659903168678284, -0.05116906762123108, 0.009919666685163975, -0.014348847791552544, -0.02665998972952366, 0.02361758053302765, 0.0636783167719841, -0.029207121580839157, -0.036933425813913345]
In [85]:
import pandas as pd

# Load the knowledge base: each line of the text file becomes one record,
# named "text" directly at read time (no header row in the file).
df = pd.read_csv("train.txt", header=None, names=["text"])
print(df)
text 0 LGX is a man. 1 LGX was born on the Earth. 2 LGX is from China.
In [86]:
import tiktoken

# Tokenizer for the configured encoding scheme.
encoding = tiktoken.get_encoding(embedding_encoding)

# Count tokens per row and store the counts in a new "n_tokens" column.
df["n_tokens"] = [len(encoding.encode(text)) for text in df["text"]]

# Keep only rows within the token budget, limited to the last `top_n` rows.
top_n = 100
df = df[df["n_tokens"] <= max_tokens].tail(top_n)

# Display the resulting DataFrame.
df
Out[86]:
text | n_tokens | |
---|---|---|
0 | LGX is a man. | 6 |
1 | LGX was born on the Earth. | 8 |
2 | LGX is from China. | 6 |
In [123]:
def get_embedding(text, model=embedding_model):
    """Call the embeddings API and return the vector for `text`.

    Newlines are flattened to spaces first, which is the recommended
    pre-processing for embedding inputs.
    """
    cleaned = text.replace("\n", " ")
    result = client.embeddings.create(input=[cleaned], model=model)
    return result.data[0].embedding

# Embed every row of the knowledge base. `model` defaults to
# `embedding_model`, so it does not need to be passed explicitly.
df['embedding'] = df["text"].apply(get_embedding)

# Display the DataFrame with its new "embedding" column.
df
Out[123]:
text | n_tokens | embedding | |
---|---|---|---|
0 | LGX is a man. | 6 | [0.013989169150590897, 0.009273948147892952, -... |
1 | LGX was born on the Earth. | 8 | [0.0102503951638937, 0.026698702946305275, 0.0... |
2 | LGX is from China. | 6 | [-0.013337323442101479, -0.011200105771422386,... |
In [124]:
from typing import List, Tuple
from scipy import spatial
import pandas as pd

# Rank knowledge-base sentences by similarity to a query.
def strings_ranked_by_relatedness(
    query: str,                                                   # query string
    df: pd.DataFrame,                                             # DataFrame with "text" and "embedding" columns
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),  # cosine similarity
    top_n: int = 100                                              # number of top results to return
) -> Tuple[List[str], List[float]]:
    """Return strings and their relatednesses, sorted from most to least related.

    Args:
        query: Text to compare against the knowledge base.
        df: DataFrame whose rows carry "text" and a precomputed "embedding".
        relatedness_fn: Similarity function over two embedding vectors
            (defaults to cosine similarity).
        top_n: Maximum number of results to return.

    Returns:
        A pair of lists: (strings, relatednesses), both ordered by descending
        relatedness. Returns ([], []) when `df` is empty.
    """
    # Embed the query via the shared helper (also normalizes newlines),
    # instead of duplicating the raw API call here.
    query_embedding = get_embedding(query)
    # Score every knowledge-base row against the query embedding.
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for _, row in df.iterrows()
    ]
    # BUGFIX: the original raised "not enough values to unpack" on an empty
    # DataFrame, because zip(*[]) yields nothing to destructure.
    if not strings_and_relatednesses:
        return [], []
    # Sort by relatedness, highest first.
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    # Split into parallel lists and truncate to the top_n results.
    strings, relatednesses = zip(*strings_and_relatednesses)
    return list(strings[:top_n]), list(relatednesses[:top_n])
In [125]:
# Demo: a query that IS related to the knowledge base.
# (The f-string uses `{relatedness=}`, so the loop variable name is part of
# the printed output and must stay `relatedness`.)
ranked = strings_ranked_by_relatedness("is LGX a man or a woman?", df, top_n=5)
for string, relatedness in zip(*ranked):
    print(f"{relatedness=:.3f}")
    display(string)
relatedness=0.763
'LGX is a man.'
relatedness=0.557
'LGX was born on the Earth.'
relatedness=0.551
'LGX is from China.'
In [126]:
# Demo: a query that is UNRELATED to the knowledge base — scores should be low.
# ("cholocates" [sic] is kept as-is; the recorded outputs below correspond to
# this exact query string.)
ranked = strings_ranked_by_relatedness("a box of cholocates", df, top_n=5)
for string, relatedness in zip(*ranked):
    print(f"{relatedness=:.3f}")
    display(string)
relatedness=0.168
'LGX is a man.'
relatedness=0.162
'LGX is from China.'
relatedness=0.092
'LGX was born on the Earth.'
In [133]:
# Chat model used for answering questions.
GPT_MODEL = "gpt-3.5-turbo"

def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens `text` occupies under `model`'s tokenizer.

    Uses tiktoken to look up the encoding registered for the given model.
    """
    return len(tiktoken.encoding_for_model(model).encode(text))
# Build the GPT prompt, including relevant texts pulled from the DataFrame.
def query_message(
    query: str,        # user query
    df: pd.DataFrame,  # DataFrame containing relevant texts and embeddings
    model: str,        # GPT model name (used only for token counting here)
    token_budget: int  # maximum number of tokens the message may use
) -> str:
    """Return a message for GPT with relevant source texts pulled from a dataframe.

    Args:
        query: The user's question.
        df: DataFrame whose rows carry "text" and "embedding" columns.
        model: Model name passed to the tokenizer for budget accounting.
        token_budget: Hard cap on the assembled message's token count.

    Returns:
        The prompt: an instruction, as many relevant sections as fit within
        `token_budget`, and the question appended at the end.
    """
    # Rank knowledge-base strings by relatedness to the query.
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    # Instruction telling GPT how to use the retrieved context.
    introduction = 'Use the articles about LGX to answer the subsequent question. If the answer cannot be found, write "I could not find an answer."'
    # The question always goes at the very end of the message.
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        # BUGFIX: the original f-string began with '\n\db ...' — "\d" is an
        # invalid escape sequence, so a literal backslash leaked into the
        # prompt (and newer Pythons warn about it). Use a clean section header.
        next_article = f'\n\nDB information section:\n"""\n{string}\n"""'
        # Stop before the budget is exceeded; the question must still fit.
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        else:
            message += next_article
    # Full prompt: introduction + fitted sections + question.
    return message + question
# Answer a query with GPT, grounding the prompt in the DataFrame's texts.
def ask(
    query: str,                    # user query
    df: pd.DataFrame = df,         # DataFrame containing texts and embeddings
    model: str = GPT_MODEL,        # GPT model to use
    token_budget: int = 4096 - 500,  # token cap, reserving 500 tokens for the reply
    print_message: bool = False,   # whether to print the generated prompt
) -> str:
    """Answer `query` using GPT and a dataframe of relevant texts and embeddings.

    Retrieval-augmented flow: build a context-stuffed prompt via
    `query_message`, send it to the chat API, and return the model's text.
    """
    # Assemble the retrieval-augmented prompt.
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)
    # System message pins the assistant to the LGX domain; the user turn
    # carries the full context + question.
    chat_messages = [
        {"role": "system", "content": "You only answer questions about LGX."},
        {"role": "user", "content": prompt},
    ]
    # temperature=0 keeps the answers as deterministic as the API allows.
    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=0,
    )
    return completion.choices[0].message.content
In [132]:
# Exercise the pipeline on a batch of questions (same calls, same order).
for question in (
    'What planet is LGX on?',
    'Is LGX a man or woman?',
    'Is LGX a male or female?',
    'Which country is LGX from?',
    'Who is LGX?',
    'Who are you?',
):
    print(ask(question))
LGX is on Earth. LGX is a man. LGX is a male. LGX is from China. LGX is a man from China who was born on Earth. I could not find an answer.
In [135]:
# Mixed-language questions: retrieval works regardless of query language.
for question in (
    'Tell me something about LGX.',
    '说说LGX',
    '你哪位?',
):
    print(ask(question))
LGX is a man from China. LGX 是一个男人,来自中国,出生在地球上。 LGX。
In [137]:
# Off-topic questions: the system prompt should make GPT decline these.
for question in (
    'Who are you?',
    'are you ok?',
    '你好',
):
    print(ask(question))
I could not find an answer. I could not find an answer. 你好! LGX is a man from China who was born on Earth.