반응형
from langchain.document_loaders import SitemapLoader
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from bs4 import BeautifulSoup
import streamlit as st
import asyncio
import sys
# Shared chat model used by both the per-document answering step and the
# final answer-selection step. Temperature is set to 0.1.
llm = ChatOpenAI(temperature=0.1)
# Prompt for the "map" step: answer the question from ONE document's text
# ({context}) and attach a 0-5 score to that answer. The template carries
# two worked examples (one scored 5, one scored 0).
# The template text starts at column 0 on purpose: it is part of the string.
answers_prompt = ChatPromptTemplate.from_template(
    """
Using ONLY the following context answer the user's question. If you can't just say you don't know, don't make anything up.
Then, give a score to the answer between 0 and 5.
If the answer answers the user question the score should be high, else it should be low.
Make sure to always include the answer's score even if it's 0.
Context: {context}
Examples:
Question: How far away is the moon?
Answer: The moon is 384,400 km away.
Score: 5
Question: How far away is the sun?
Answer: I don't know
Score: 0
Your turn!
Question: {question}
"""
)
def get_answers(inputs):
    """Answer the question once per retrieved document.

    Each document is passed on its own through ``answers_prompt | llm``,
    so every document yields one (self-scored) answer.

    Args:
        inputs: Mapping with two keys:
            "docs": iterable of documents exposing ``page_content`` and
                ``metadata["source"]``.
            "question": the user's question.

    Returns:
        A dict with the original "question" and "answers", a list of
        ``{"answer": <model output text>, "source": <document source>}``
        in the same order as the incoming documents.

    Raises:
        KeyError: if "docs"/"question" is missing from ``inputs`` or a
            document has no "source" metadata entry.
    """
    docs = inputs["docs"]
    question = inputs["question"]
    answers_chain = answers_prompt | llm
    # The stray debugging ``print(docs)`` was removed: it dumped the full
    # text of every retrieved document to stdout on each query.
    return {
        "question": question,
        "answers": [
            {
                "answer": answers_chain.invoke(
                    {"question": question, "context": doc.page_content}
                ).content,
                "source": doc.metadata["source"],
            }
            for doc in docs
        ],
    }
def choose_answer(inputs):
    """Ask the model to pick/compose the final answer from the candidates.

    Args:
        inputs: Mapping with "answers" (list of dicts holding "answer" and
            "source") and "question".

    Returns:
        Whatever ``(choose_prompt | llm).invoke(...)`` returns.
    """
    # One text block per candidate: the answer followed by its source line.
    formatted = [
        f"{item['answer']}\nSource:{item['source']}\n"
        for item in inputs["answers"]
    ]
    payload = {
        "question": inputs["question"],
        # Candidates are separated by a blank line.
        "answers": "\n\n".join(formatted),
    }
    return (choose_prompt | llm).invoke(payload)
# Prompt for the "re-rank" step: the system message receives every
# per-document answer ({answers}) and the human message is the question.
# NOTE(review): the system message asks to favor the most recent answers,
# but choose_answer only passes answer text and source (no date) - confirm
# whether a date field was meant to be included.
# The system text starts at column 0 on purpose: it is part of the string.
choose_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
Use ONLY the following pre-existing answers to answer the user's question.
Use the answers that have the highest score (more helpful) and favor the most recent ones.
Cite sources and return the sources of the answers as they are, do not change them.
Answers: {answers}
""",
        ),
        ("human", "{question}"),
    ]
)
def parse_page(soup: BeautifulSoup):
    """Return the page text on one line, without header/footer content.

    The first ``<header>`` and first ``<footer>`` elements (if any) are
    removed from ``soup`` in place before the text is extracted.
    """
    # Look both elements up first, then remove them, header before footer.
    header, footer = soup.find("header"), soup.find("footer")
    for chrome in (header, footer):
        if chrome:
            chrome.decompose()

    text = str(soup.get_text())
    # Applied in order: newlines and non-breaking spaces become plain
    # spaces, then a fixed boilerplate phrase is dropped.
    substitutions = (
        ("\n", " "),
        ("\xa0", " "),
        ("CloseSearch Submit Blog", ""),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text
# st.cache_resource is used instead of st.cache_data: the returned retriever
# is a live object (vector index plus embeddings client), not plain data.
# cache_data would serialize it on store and hand back a fresh copy on every
# hit; cache_resource keeps and returns the single shared instance.
# NOTE(review): the shared retriever is only read from in this script;
# confirm that holds if more callers are added.
@st.cache_resource(show_spinner="Loading website...")
def load_website(url):
    """Load a sitemap, index its pages and return a retriever over them.

    Args:
        url: URL of the sitemap to crawl (also the cache key).

    Returns:
        The retriever produced by ``FAISS.from_documents(...).as_retriever()``.
    """
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000,
        chunk_overlap=200,
    )
    loader = SitemapLoader(
        url,
        parsing_function=parse_page,
    )
    # Throttle crawling to two requests per second.
    loader.requests_per_second = 2
    docs = loader.load_and_split(text_splitter=splitter)
    vector_store = FAISS.from_documents(docs, OpenAIEmbeddings())
    return vector_store.as_retriever()
# Browser tab title and icon for the app.
st.set_page_config(page_title="SiteGPT", page_icon="🖥️")

# Intro text shown at the top of the main page. The markdown body starts at
# column 0 on purpose: it is part of the string.
st.markdown(
    """
# SiteGPT
Ask questions about the content of a website.
Start by writing the URL of the website on the sidebar.
"""
)
# Platform-specific asyncio setup.
# NOTE(review): `cmds` is not referenced anywhere else in the visible code;
# it looks like a leftover from an example - confirm before removing.
if 'win32' not in sys.platform:
    # Non-Windows: the default event-loop policy is left untouched.
    cmds = [
        ['du', '-sh', target]
        for target in (
            '/Users/fredrik/Desktop',
            '/Users/fredrik',
            '/Users/fredrik/Pictures',
        )
    ]
else:
    # Windows: switch asyncio to the selector event-loop policy.
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    cmds = [['C:/Windows/system32/HOSTNAME.EXE']]
# --- Script body: sidebar input, validation, then the map re-rank chain ---

# The sitemap URL is entered in the sidebar.
with st.sidebar:
    url = st.text_input(
        "Write down a URL",
    )

if url:
    if ".xml" not in url:
        # Only sitemap (.xml) URLs are accepted; report the problem next to
        # the input field.
        with st.sidebar:
            st.error("Please write down a Sitemap URL.")
    else:
        retriever = load_website(url)
        query = st.text_input("Ask a question to the website.")
        if query:
            # The query is sent to the retriever ("docs") and passed through
            # unchanged ("question"); get_answers then answers per document
            # and choose_answer produces the final reply.
            chain = (
                {
                    "docs": retriever,
                    "question": RunnablePassthrough(),
                }
                | RunnableLambda(get_answers)
                | RunnableLambda(choose_answer)
            )
            result = chain.invoke(query)
            # Prefix every "$" with a backslash before rendering, presumably
            # so the markdown renderer does not treat it as a math
            # delimiter. "\\$" is the same two characters the original
            # "\$" produced, without the invalid-escape-sequence warning.
            st.markdown(result.content.replace("$", "\\$"))
Map re-rank
문서들마다 각각의 답변을 만들어 내고, 이에 점수를 부여
가장 높은 점수를 받은 답변을 최종 답변으로 채택함
문서가 구분되어 있고 높은 유사성을 요구할 때 유용
answers_prompt의 내용을 확인하면 질문·답변·점수 형식의 예시가 포함되어 있음
'python' 카테고리의 다른 글
[GPT][MEETINGGPT] Cutting The Audio (0) | 2024.05.14 |
---|---|
[GPT][MEETINGGPT] Audio Extraction (0) | 2024.05.14 |
[GPT][SITEGPT] Parsing Function (0) | 2024.05.13 |
[GPT][SITEGPT] SitemapLoader (0) | 2024.05.13 |
[GPT][SITEGPT] AsyncChromiumLoader (0) | 2024.05.02 |