Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -142,4 +142,8 @@ dmypy.json
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
Thumbs.db

# PDM package manager
.pdm-python
pdm.lock
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ authors = [
]
readme = "README.md"
license = "MIT"
requires-python = ">=3.8"
requires-python = ">=3.9"
keywords = ["html", "css", "image", "conversion", "rendering"]
classifiers = [
"Development Status :: 3 - Alpha",
Expand All @@ -32,7 +32,7 @@ classifiers = [
dependencies = [
"pictex>=2.0.0",
"beautifulsoup4>=4.9.0",
"tinycss2>=1.1.0",
"tinycss2>=1.1.0"
]

[project.optional-dependencies]
Expand Down
57 changes: 54 additions & 3 deletions src/html2pic/html2pic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .dom import DOMNode
from .parsing import CSSRule
from .fonts import FontRegistry
from typing import Optional, List
from typing import Optional, List, Callable

class Html2Pic:
"""Convert HTML + CSS to images using PicTex.
Expand All @@ -27,9 +27,11 @@ class Html2Pic:
```
"""

def __init__(self, html: str, css: str = ""):
def __init__(self, html: str, css: str = "", fetcher: Optional[Callable[[str], str]] = None, root_url: Optional[str] = None):
self.html = html
self.css = css
self.fetcher = fetcher
self.root_url = root_url

self._html_parser: HtmlParser = HtmlParser()
self._css_parser: CssParser = CssParser()
Expand All @@ -48,19 +50,26 @@ def __init__(self, html: str, css: str = ""):
def dom_tree(self) -> DOMNode:
    """Return the parsed DOM tree, building and caching it on first access.

    The first access parses ``self.html`` and then merges any CSS found in
    the document (via ``_extract_css_from_dom``) into ``self.css``. When
    ``self.css`` is already non-empty the extracted CSS is appended after a
    blank line; otherwise it replaces the empty value.
    """
    # Fast path: the tree has already been built.
    if self._dom_tree is not None:
        return self._dom_tree

    self._dom_tree = self._html_parser.parse(self.html)

    # Fold document-provided CSS into the user-supplied stylesheet.
    embedded_css = self._extract_css_from_dom(self._dom_tree)
    if embedded_css:
        if self.css:
            self.css = self.css + "\n\n" + embedded_css
        else:
            self.css = embedded_css

    return self._dom_tree

@property
def style_rules(self) -> List[CSSRule]:
    """Return the parsed CSS rules, parsing and caching them on first access.

    Parsing also yields the font registry, which is stored on
    ``self._font_registry`` as a side effect.
    """
    if self._style_rules is not None:
        return self._style_rules

    # Touch the DOM first: building it merges CSS extracted from the
    # document into self.css, which must happen before the CSS is parsed.
    _ = self.dom_tree

    parsed_rules, parsed_fonts = self._css_parser.parse(self.css)
    self._style_rules = parsed_rules
    self._font_registry = parsed_fonts
    return self._style_rules

@property
def font_registry(self) -> FontRegistry:
    """Return the font registry produced while parsing the CSS.

    The registry is populated as a side effect of ``style_rules``, so that
    property is evaluated first when the registry has not been built yet.
    """
    if self._font_registry is None:
        # Evaluating style_rules parses the CSS and assigns _font_registry.
        _ = self.style_rules
    # The attribute is declared Optional, hence the type-checker suppression.
    return self._font_registry  # pyright: ignore[reportReturnType]

@property
def styled_tree(self) -> DOMNode:
Expand Down Expand Up @@ -104,6 +113,48 @@ def render_as_svg(self, embed_font: bool = True) -> pictex.VectorImage:
self._print_warnings()
raise RenderError(f"Failed to render SVG: {e}") from e

def _extract_css_from_dom(self, node: DOMNode) -> str:
    """Collect CSS from ``<style>`` and ``<link rel="stylesheet">`` elements.

    The tree rooted at ``node`` is walked depth-first in document order.
    Text inside ``<style>`` elements is taken verbatim (stripped); linked
    stylesheets are retrieved through ``self.fetcher``. Relative ``href``
    values are resolved against ``self.root_url`` when it is set.

    Args:
        node: Root of the DOM (sub)tree to scan.

    Returns:
        All collected stylesheets joined by a blank line, or an empty
        string when nothing was found.

    Fetch failures never propagate: they are recorded on the warning
    collector and the affected stylesheet is skipped.
    """
    from urllib.parse import urljoin

    css_content = []

    def fetch_linked_stylesheet(href: str) -> None:
        """Fetch one linked stylesheet and append it; warn on failure."""
        try:
            # Resolve URL if root_url is provided
            if self.root_url and not href.startswith(('http://', 'https://')):
                href = urljoin(self.root_url, href)

            # FIXME: Workaround; CSS3 @import handling is not implemented here
            fetched_css = self.fetcher(href)
            if fetched_css:
                css_content.append(fetched_css)
        except Exception as e:
            # Best effort: a broken stylesheet must not abort rendering.
            self._warnings.warn_unexpected_error(f"Failed to fetch CSS from {href}: {e}")

    def traverse(n: DOMNode) -> None:
        if n.is_element():
            tag = n.tag.lower() if n.tag else ''

            if tag == 'style':
                text = n.get_all_text().strip()
                if text:
                    css_content.append(text)

            elif tag == 'link':
                # ``rel`` is a space-separated token list, so test for the
                # token rather than comparing the whole attribute value.
                # Alternate stylesheets are not applied by default.
                rel_tokens = n.attributes.get('rel', '').lower().split()
                if 'stylesheet' in rel_tokens and 'alternate' not in rel_tokens:
                    if not self.fetcher:
                        self._warnings.warn_unexpected_error("No fetcher provided.")
                    else:
                        href = n.attributes.get('href')
                        if href:
                            fetch_linked_stylesheet(href)

        for child in n.children:
            traverse(child)

    traverse(node)
    return '\n\n'.join(css_content)

def debug_info(self) -> Dict[str, Any]:
return {
"dom_tree": self.dom_tree,
Expand Down
2 changes: 1 addition & 1 deletion src/html2pic/parsing/css_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ..fonts import FontFace, FontRegistry, FontSrcParser
from ..warnings import get_warning_collector
from ..exceptions import ParseError
from ..styiling import DEFAULT_STYLES
from ..styling import DEFAULT_STYLES


class CssParser:
Expand Down
48 changes: 25 additions & 23 deletions src/html2pic/parsing/html_parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""HTML parser using BeautifulSoup."""

from typing import Optional

from bs4 import BeautifulSoup, NavigableString, Tag
from bs4 import BeautifulSoup, Tag
from bs4.element import NavigableString

from ..dom import DOMNode, NodeType
from ..warnings import get_warning_collector
Expand All @@ -10,67 +12,67 @@

class HtmlParser:
    """Parses HTML content into a DOM tree."""

    # Tags that are dropped (with a warning) instead of becoming DOM nodes.
    SKIP_TAGS = {"script", "meta", "title"}

    def __init__(self):
        self.warnings = get_warning_collector()

    def parse(self, html_content: str) -> DOMNode:
        """Parse HTML string into a DOM tree.

        Raises:
            ParseError: If BeautifulSoup or the DOM conversion fails.
        """
        try:
            soup = BeautifulSoup(html_content, "html.parser")
            return self._create_root(soup)
        except Exception as e:
            raise ParseError(f"Failed to parse HTML: {e}") from e

    def _create_root(self, soup: BeautifulSoup) -> DOMNode:
        """Wrap the document's top-level nodes in a synthetic ``div._root``."""
        root = DOMNode(
            node_type=NodeType.ELEMENT, tag="div", attributes={"class": "_root"}
        )

        for child in soup.children:
            child_node = self._process_node(child, root)
            if child_node:
                root.children.append(child_node)

        return root

    def _process_node(self, node, parent: DOMNode) -> Optional[DOMNode]:
        """Convert one BeautifulSoup node (and its subtree) into a DOMNode.

        Returns ``None`` for nodes that produce no DOM output: markup-only
        strings (comments, doctype, declarations, processing instructions),
        whitespace-only text, tags listed in ``SKIP_TAGS``, and any other
        unrecognised node type.
        """
        # Local import: these classes are only needed for this check.
        from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

        if isinstance(node, NavigableString):
            # Comment, Doctype, etc. subclass NavigableString; without this
            # guard ``<!-- note -->`` or ``<!DOCTYPE html>`` would be emitted
            # as visible text nodes.
            if isinstance(node, (Comment, Declaration, Doctype, ProcessingInstruction)):
                return None

            text = str(node).strip()
            if text:
                return DOMNode(
                    node_type=NodeType.TEXT, text_content=text, parent=parent
                )
            return None

        if isinstance(node, Tag):
            tag_name = node.name.lower()

            if tag_name in self.SKIP_TAGS:
                self.warnings.warn_unsupported_html_tag(tag_name)
                return None

            # Multi-valued attributes (e.g. ``class``) arrive as lists;
            # flatten them to a single space-separated string.
            attrs = {}
            for key, value in node.attrs.items():
                if isinstance(value, list):
                    attrs[key] = " ".join(value)
                else:
                    attrs[key] = value

            dom_node = DOMNode(
                node_type=NodeType.ELEMENT,
                tag=tag_name,
                attributes=attrs,
                parent=parent,
            )

            for child in node.children:
                child_node = self._process_node(child, dom_node)
                if child_node:
                    dom_node.children.append(child_node)

            return dom_node

        return None
7 changes: 6 additions & 1 deletion src/html2pic/translation/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
class PicTexTranslator:
"""Translates a styled DOM tree into PicTex builders."""

SKIP_TAGS = {"link", "style", "head"}

def __init__(self):
self.warnings = get_warning_collector()
self.element_factory = ElementFactory()
Expand All @@ -35,7 +37,7 @@ def __init__(self):
def translate(
self,
styled_dom: DOMNode,
font_registry: FontRegistry = None
font_registry: Optional[FontRegistry] = None
) -> Tuple[Canvas, Optional[Element]]:
"""Translate a styled DOM tree to PicTex builders."""
self.font_registry = font_registry
Expand Down Expand Up @@ -73,6 +75,9 @@ def _create_text(self, node: DOMNode) -> Optional[Text]:
return self.element_factory.create_text(content)

def _create_element(self, node: DOMNode) -> Optional[Element]:
if node.tag and node.tag.lower() in self.SKIP_TAGS:
return None

styles = node.computed_styles

if styles.get('display') == 'none':
Expand Down