Bases: BaseWebScraperDriver
Source code in griptape/drivers/web_scraper/trafilatura_web_scraper_driver.py
@define
class TrafilaturaWebScraperDriver(BaseWebScraperDriver):
    """Web scraper driver that fetches and extracts pages with trafilatura.

    Attributes:
        include_links: Forwarded to ``trafilatura.extract`` as ``include_links``.
        no_ssl: Forwarded to ``trafilatura.fetch_url`` as ``no_ssl``.
    """

    include_links: bool = field(default=True, kw_only=True)
    no_ssl: bool = field(default=False, kw_only=True)

    def fetch_url(self, url: str) -> str:
        """Download the page at ``url`` and return what trafilatura fetched.

        Args:
            url: Address of the page to download.

        Returns:
            The value returned by ``trafilatura.fetch_url``.

        Raises:
            Exception: If ``trafilatura.fetch_url`` returns ``None``.
        """
        trafilatura = import_optional_dependency("trafilatura")

        page = trafilatura.fetch_url(url, no_ssl=self.no_ssl)

        # Disable error logging in trafilatura as it sometimes logs errors from lxml, even though
        # the end result of page parsing is successful.
        logging.getLogger("trafilatura").setLevel(logging.FATAL)

        if page is None:
            raise Exception("can't access URL")

        return page

    def extract_page(self, page: str) -> TextArtifact:
        """Extract the text of a fetched page and wrap it in a ``TextArtifact``.

        Args:
            page: Page content, as returned by ``fetch_url``.

        Returns:
            A ``TextArtifact`` built from the ``"text"`` field of trafilatura's JSON output.

        Raises:
            Exception: If ``trafilatura.extract`` returns an empty result.
        """
        trafilatura = import_optional_dependency("trafilatura")

        config = trafilatura.settings.use_config()
        # This disables signal, so that trafilatura can work on any thread:
        # More info: https://trafilatura.readthedocs.io/usage-python.html#disabling-signal
        # The override has to be set on the very config object handed to extract();
        # setting it on any other config instance has no effect on extraction.
        config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

        # Disable error logging in trafilatura as it sometimes logs errors from lxml, even though
        # the end result of page parsing is successful.
        logging.getLogger("trafilatura").setLevel(logging.FATAL)

        extracted_page = trafilatura.extract(
            page,
            include_links=self.include_links,
            output_format="json",
            config=config,
        )

        if not extracted_page:
            raise Exception("can't extract page")

        text = json.loads(extracted_page).get("text")

        return TextArtifact(text)
|
include_links: bool = field(default=True, kw_only=True)
class-attribute
instance-attribute
no_ssl: bool = field(default=False, kw_only=True)
class-attribute
instance-attribute
extract_page(page)
Source code in griptape/drivers/web_scraper/trafilatura_web_scraper_driver.py
def extract_page(self, page: str) -> TextArtifact:
    """Extract the text of a fetched page and wrap it in a ``TextArtifact``.

    Args:
        page: Page content to run through ``trafilatura.extract``.

    Returns:
        A ``TextArtifact`` built from the ``"text"`` field of trafilatura's JSON output.

    Raises:
        Exception: If ``trafilatura.extract`` returns an empty result.
    """
    trafilatura = import_optional_dependency("trafilatura")

    config = trafilatura.settings.use_config()
    # This disables signal, so that trafilatura can work on any thread:
    # More info: https://trafilatura.readthedocs.io/usage-python.html#disabling-signal
    # The override has to be set on the very config object handed to extract();
    # setting it on any other config instance has no effect on extraction.
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

    # Disable error logging in trafilatura as it sometimes logs errors from lxml, even though
    # the end result of page parsing is successful.
    logging.getLogger("trafilatura").setLevel(logging.FATAL)

    extracted_page = trafilatura.extract(
        page,
        include_links=self.include_links,
        output_format="json",
        config=config,
    )

    if not extracted_page:
        raise Exception("can't extract page")

    text = json.loads(extracted_page).get("text")

    return TextArtifact(text)
|
fetch_url(url)
Source code in griptape/drivers/web_scraper/trafilatura_web_scraper_driver.py
def fetch_url(self, url: str) -> str:
    """Download the page at ``url`` and return what trafilatura fetched.

    Args:
        url: Address of the page to download.

    Returns:
        The value returned by ``trafilatura.fetch_url``.

    Raises:
        Exception: If ``trafilatura.fetch_url`` returns ``None``.
    """
    trafilatura = import_optional_dependency("trafilatura")

    # ``self.no_ssl`` is forwarded unchanged to trafilatura's downloader.
    page = trafilatura.fetch_url(url, no_ssl=self.no_ssl)

    # Disable error logging in trafilatura as it sometimes logs errors from lxml, even though
    # the end result of page parsing is successful.
    logging.getLogger("trafilatura").setLevel(logging.FATAL)

    if page is None:
        raise Exception("can't access URL")

    return page
|