Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion v7/import_page/README.md → v8/import_page/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Plugin to import arbitrary web pages.
Plugin to import arbitrary web pages (from a URL or a local file).

Usage:

Expand Down
File renamed without changes.
File renamed without changes.
75 changes: 46 additions & 29 deletions v7/import_page/import_page.py → v8/import_page/import_page.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright © 2015 Roberto Alsina and others
# Copyright © 2025 Roberto Alsina and others

# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
Expand Down Expand Up @@ -29,10 +29,9 @@
import codecs

try:
import libextract.api
from bs4 import BeautifulSoup
except ImportError:
libextract = None
import lxml.html
BeautifulSoup = None
import requests
import sys

Expand Down Expand Up @@ -61,34 +60,52 @@ class CommandImportPage(Command):

def _execute(self, options, args):
    """Import a Page.

    Every entry in ``args`` is handed to ``_import_page``.

    Returns 0 when all pages were imported and 1 when at least one
    import reported a failure, so the failure is visible to the caller
    instead of being silently dropped.
    """
    if BeautifulSoup is None:
        utils.req_missing(['bs4'], 'use the import_page plugin')
    exit_code = 0
    for url in args:
        # _import_page returns a truthy value (1) on failure, None on success.
        # Keep going so one bad source does not block the remaining ones.
        if self._import_page(url):
            exit_code = 1
    return exit_code

def _import_page(self, url):
    """Import one page from an HTTP(S) URL or from a local file.

    The document's ``<title>`` supplies the title and (slugified) the output
    file name. The content is the text of whichever ``p``/``div``/
    ``article``/``section`` element contains the most text. The rendered
    ``doc_template`` is written to ``<slug>.html``.

    Returns 1 after logging an error when the source cannot be fetched or
    read, and None on success.
    """
    # Imported here because bs4 is optional at module level; this method is
    # only reached after _execute has verified that bs4 is available.
    from bs4 import FeatureNotFound

    parse = requests.utils.urlparse(url)
    # Compare the scheme exactly; a substring test would also accept any
    # other scheme that merely contains "http".
    if parse.scheme in ('http', 'https'):
        try:
            r = requests.get(url)
        except requests.RequestException as e:
            LOGGER.error(f'Error fetching URL "{url}": {e}')
            return 1
        if not (199 < r.status_code < 300):  # Did not get it
            LOGGER.error(f'Error fetching URL: {url}')
            return 1
        html = r.content
        if r.encoding and 'utf-8' not in r.encoding.lower():
            # Decode with the encoding announced by the server and hand the
            # parser text, so it does not re-decode the bytes using a
            # possibly different charset declared inside the document.
            try:
                html = r.content.decode(r.encoding)
            except (LookupError, UnicodeDecodeError):
                # Unknown or wrong charset: let the parser sniff the bytes.
                html = r.content
    else:
        try:
            with open(url, 'rb') as f:
                html = f.read()
        except FileNotFoundError:
            LOGGER.error(f'Error file does not exist: {url}')
            return 1
        except OSError as e:
            LOGGER.error(f'Error opening file "{url}": {e}')
            return 1

    try:
        soup = BeautifulSoup(html, "lxml")
    except (FeatureNotFound, ImportError):
        # bs4 signals a missing lxml parser with FeatureNotFound, so that is
        # the exception that must trigger the pure-Python fallback.
        soup = BeautifulSoup(html, "html.parser")

    title = soup.title.get_text(strip=True) if soup.title else ''
    if not title:
        # Covers both a missing and an empty <title> element.
        title = "Untitled Page"
    try:
        slug = utils.slugify(title, lang='')
    except TypeError:
        # Fallback for a slugify() that does not accept ``lang``.
        slug = utils.slugify(title)

    candidates = soup.find_all(["p", "div", "article", "section"])
    if candidates:
        # Let's assume the node with more text is the good one.
        # NOTE(review): a wrapper <div> enclosing the whole page will win
        # this comparison; confirm that is acceptable.
        node = max(candidates, key=lambda n: len(n.get_text(strip=True)))
        content = node.get_text(strip=True)
    else:
        # Nothing to pick from: write an empty body instead of crashing on
        # a None node.
        LOGGER.warning(f'No content found in: {url}')
        content = ''

    document = doc_template.format(
        title=title,
        slug=slug,
        content=content
    )
    with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
        outf.write(document)
File renamed without changes.