Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion v7/import_page/README.md → v8/import_page/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Plugin to import arbitrary web pages.
Plugin to import arbitrary web pages (from a URL or a local file).

Usage:

Expand Down
File renamed without changes.
File renamed without changes.
74 changes: 45 additions & 29 deletions v7/import_page/import_page.py → v8/import_page/import_page.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright © 2015 Roberto Alsina and others
# Copyright © 2025 Roberto Alsina and others

# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
Expand Down Expand Up @@ -29,10 +29,9 @@
import codecs

try:
import libextract.api
from bs4 import BeautifulSoup
except ImportError:
libextract = None
import lxml.html
BeautifulSoup = None
import requests
import sys

Expand Down Expand Up @@ -61,34 +60,51 @@ class CommandImportPage(Command):

def _execute(self, options, args):
"""Import a Page."""
if libextract is None:
utils.req_missing(['libextract'], 'use the import_page plugin')
if BeautifulSoup is None:
utils.req_missing(['bs4'], 'use the import_page plugin')
for url in args:
self._import_page(url)

def _import_page(self, url):
r = requests.get(url)
if 199 < r.status_code < 300: # Got it
# Use the page's title
doc = lxml.html.fromstring(r.content)
title = doc.find('*//title').text
if sys.version_info[0] == 2 and isinstance(title, str):
title = title.decode('utf-8')
parse = requests.utils.urlparse(url)
if 'http' in parse.scheme:
r = requests.get(url)
if not (199 < r.status_code < 300): # Did not get it
LOGGER.error(f'Error fetching URL: {url}')
return 1
html = r.content
else:
try:
slug = utils.slugify(title, lang='')
except TypeError:
slug = utils.slugify(title)
nodes = list(libextract.api.extract(r.content))
# Let's assume the node with more text is the good one
lengths = [len(n.text_content()) for n in nodes]
node = nodes[lengths.index(max(lengths))]
document = doc_template.format(
title=title,
slug=slug,
content=lxml.html.tostring(node, encoding='utf8', method='html', pretty_print=True).decode('utf8')
)
with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
outf.write(document)

with open(url, 'rb') as f:
html = f.read()
except FileNotFoundError:
LOGGER.error(f'Error file does not exist: {url}')
return 1
except (OSError, IOError) as e:
LOGGER.error(f'Error opening file "{url}": {e}')
return 1

try:
soup = BeautifulSoup(html, "lxml")
except ImportError:
soup = BeautifulSoup(html, "html.parser")

title = soup.title.text if soup.title else "Untitled Page"
try:
slug = utils.slugify(title, lang='')
except TypeError:
slug = utils.slugify(title)

candidates = soup.find_all(["p", "div", "article", "section"])
if candidates:
node = max(candidates, key=lambda n: len(n.get_text(strip=True)))
else:
LOGGER.error('Error fetching URL: {}'.format(url))
node = None # empty

document = doc_template.format(
title=title,
slug=slug,
content=node.prettify()
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we leave the HTML as it is? If the selected node is an 'article' element, should I extract only its content (most likely the website template already has its own 'article' element)?

Copy link
Copy Markdown
Author

@jirib jirib Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to add functionality for this; see ab15bed.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should remove the wrapper element (e.g. <article>) by default.

)
with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
outf.write(document)
File renamed without changes.