반응형
    
    
    
  
from langchain.document_loaders import SitemapLoader, text
from langchain.text_splitter import RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup
import streamlit as st
import asyncio
import sys
def parse_page(soup: BeautifulSoup):
    """Return the cleaned text of a parsed page, with site chrome removed.

    Drops the <header> and <footer> tags (navigation/boilerplate), then
    normalizes whitespace artifacts and strips the site's recurring
    boilerplate string.
    """
    # Remove navigation chrome so only the article body remains.
    for tag_name in ("header", "footer"):
        tag = soup.find(tag_name)
        if tag is not None:
            tag.decompose()
    page_text = str(soup.get_text())
    # Newlines and non-breaking spaces become plain spaces; the
    # "CloseSearch Submit Blog" string is leftover site UI text.
    page_text = page_text.replace("\n", " ")
    page_text = page_text.replace("\xa0", " ")
    return page_text.replace("CloseSearch Submit Blog", "")
@st.cache_data(show_spinner="Loading website...")
def load_website(url):
    """Load pages from the sitemap at *url* and return them as split documents.

    Only URLs whose path contains ``/notice/`` are crawled; each page is
    cleaned by ``parse_page`` and then chunked by a token-based splitter.
    Results are cached by Streamlit, keyed on *url*.
    """
    # Token-based splitter: ~1000-token chunks overlapping by 200 tokens.
    chunker = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000,
        chunk_overlap=200,
    )
    sitemap_loader = SitemapLoader(
        url,
        # Crawl only URLs whose path contains /notice/.
        filter_urls=[
            r"^(.*\/notice\/).*",
        ],
        parsing_function=parse_page,
    )
    # Throttle requests to be polite to the target server.
    sitemap_loader.requests_per_second = 2
    return sitemap_loader.load_and_split(text_splitter=chunker)
st.set_page_config(
    page_title="SiteGPT",
    page_icon="🖥️",
)

st.markdown(
    """
    # SiteGPT
    Ask questions about the content of a website.
    Start by writing the URL of the website on the sidebar.
"""
)

if 'win32' in sys.platform:
    # NOTE(review): presumably set because the sitemap loader's async HTTP
    # requires the selector event loop on Windows — confirm against the
    # loader's requirements.
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    # Removed: unused `cmds` lists (HOSTNAME.EXE / `du -sh` on hard-coded
    # personal paths) — dead code copied from an unrelated asyncio example.

# Sidebar input: the user supplies the sitemap URL to crawl.
with st.sidebar:
    url = st.text_input(
        "Write down a URL",
    )

if url:
    # A sitemap URL is required (e.g. https://example.com/sitemap.xml).
    if ".xml" not in url:
        with st.sidebar:
            st.error("Please write down a Sitemap URL.")
    else:
        docs = load_website(url)
        st.write(docs)
사이트맵에서 가져온 정보에 대해 구문을 분석함
beautifulsoup을 이용하여 처리
BeautifulSoup의 Tag.decompose() 
Tag.decompose()는 HTML 문서 트리에서 해당 태그를 떼어낸 뒤, 그 태그와 내용을 함께 파괴(삭제)함 
find() 
HTML 문서에서 해당 태그 하나를 가져옴
get_text() 
해당구문의 텍스트를 가져옴
'python' 카테고리의 다른 글
| [GPT][MEETINGGPT] Audio Extraction (0) | 2024.05.14 | 
|---|---|
| [GPT][SITEGPT] Map Re Rank Chain (0) | 2024.05.13 | 
| [GPT][SITEGPT] SitemapLoader (0) | 2024.05.13 | 
| [GPT][SITEGPT] AsyncChromiumLoader (0) | 2024.05.02 | 
| [GPT][QUIZGPT]Output Parser 를 이용한 데이터 형태 제어 (1) | 2024.04.18 |