From 9dbd5044a4515332ffbdb8211bd85bb36ac491ef Mon Sep 17 00:00:00 2001 From: Duan Yao Date: Sat, 21 Mar 2015 14:56:38 +0800 Subject: [PATCH] Convert illegal html unicode without glyph to space or zero-width space. --- 3rdparty/poppler/git/CairoFontEngine.cc | 8 +++-- 3rdparty/poppler/git/CairoFontEngine.h | 6 ++-- src/HTMLRenderer/HTMLRenderer.h | 12 +++++++ src/HTMLRenderer/font.cc | 26 +++++++++++++++ src/HTMLRenderer/general.cc | 12 +++++++ src/HTMLRenderer/text.cc | 42 ++++++++++++++++--------- src/HTMLTextLine.cc | 26 +++++++++++---- src/HTMLTextLine.h | 3 ++ src/util/unicode.h | 2 ++ 9 files changed, 112 insertions(+), 25 deletions(-) diff --git a/3rdparty/poppler/git/CairoFontEngine.cc b/3rdparty/poppler/git/CairoFontEngine.cc index 229a86cf0..b4dfd57c0 100644 --- a/3rdparty/poppler/git/CairoFontEngine.cc +++ b/3rdparty/poppler/git/CairoFontEngine.cc @@ -377,6 +377,7 @@ _ft_new_face (FT_Library lib, CairoFreeTypeFont::CairoFreeTypeFont(Ref ref, cairo_font_face_t *cairo_font_face, + FT_Face ft_face, int *codeToGID, Guint codeToGIDLen, GBool substitute) : CairoFont(ref, @@ -384,7 +385,10 @@ CairoFreeTypeFont::CairoFreeTypeFont(Ref ref, codeToGID, codeToGIDLen, substitute, - gTrue) { } + gTrue), + // Caution: this field is added by pdf2htmlEX to determine whitespace. Please merge during update. 
+ ft_face(ft_face) + { } CairoFreeTypeFont::~CairoFreeTypeFont() { } @@ -546,7 +550,7 @@ CairoFreeTypeFont *CairoFreeTypeFont::create(GfxFont *gfxFont, XRef *xref, delete fontLoc; return new CairoFreeTypeFont(ref, - font_face, + font_face, face, codeToGID, codeToGIDLen, substitute); diff --git a/3rdparty/poppler/git/CairoFontEngine.h b/3rdparty/poppler/git/CairoFontEngine.h index 432f10715..dd2f957d5 100644 --- a/3rdparty/poppler/git/CairoFontEngine.h +++ b/3rdparty/poppler/git/CairoFontEngine.h @@ -75,10 +75,12 @@ class CairoFreeTypeFont : public CairoFont { public: static CairoFreeTypeFont *create(GfxFont *gfxFont, XRef *xref, FT_Library lib, GBool useCIDs); virtual ~CairoFreeTypeFont(); - + // Caution: this function is added by pdf2htmlEX to determine whitespace. Please merge during update. + FT_Face get_ft_face() { return ft_face; } private: - CairoFreeTypeFont(Ref ref, cairo_font_face_t *cairo_font_face, + CairoFreeTypeFont(Ref ref, cairo_font_face_t *cairo_font_face, FT_Face ft_face, int *codeToGID, Guint codeToGIDLen, GBool substitute); + FT_Face ft_face; }; //------------------------------------------------------------------------ diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h index 18e395d58..738aff257 100644 --- a/src/HTMLRenderer/HTMLRenderer.h +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -12,6 +12,8 @@ #include #include +#include +#include FT_FREETYPE_H #include #include #include @@ -42,6 +44,7 @@ #include "util/const.h" #include "util/misc.h" +class CairoFontEngine; namespace pdf2htmlEX { @@ -217,6 +220,10 @@ struct HTMLRenderer : OutputDev // make sure the current HTML style consistent with PDF void prepare_text_line(GfxState * state); + // Check whether this char has a non-empty glyph in this font. If not sure, return true. + // A char that has an empty glyph or no glyph at all is usually whitespace. 
+ bool has_glyph(CharCode code, GfxFont* font); + //////////////////////////////////////////////////// // PDF stuffs //////////////////////////////////////////////////// @@ -341,6 +348,11 @@ struct HTMLRenderer : OutputDev CoveredTextDetector covered_text_detector; DrawingTracer tracer; + +#if ENABLE_SVG + FT_Library ft_lib; + std::unique_ptr font_engine; +#endif }; } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc index 2ca9fa31a..5bb52f2f8 100644 --- a/src/HTMLRenderer/font.cc +++ b/src/HTMLRenderer/font.cc @@ -38,6 +38,7 @@ #include "CairoFontEngine.h" #include "CairoOutputDev.h" #include +#include FT_OUTLINE_H #endif namespace pdf2htmlEX { @@ -1082,4 +1083,29 @@ void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, cons f_css.fs << "}" << endl; } +bool HTMLRenderer::has_glyph(CharCode code, GfxFont* font) // false only when the font is known to have no glyph (gid 0) or an empty outline for this code; true whenever that cannot be determined +{ +#if ENABLE_SVG + if (font->getType() == fontType3) + return true; // Type 3 fonts are not inspected here: not sure, so assume a glyph + CairoFreeTypeFont* ftfont = static_cast<CairoFreeTypeFont*>(font_engine->getFont(font, cur_doc, false, xref)); + if (ftfont == nullptr) + return true; // the font engine could not provide the font: not sure, so assume a glyph + FT_Face face = ftfont->get_ft_face(); + if (face == nullptr) + return true; // no FreeType face to inspect: not sure, so assume a glyph + auto gid = ftfont->getGlyph(code, nullptr, 0); + // gid == 0 means no glyph + if (gid == 0) + return false; + if (FT_Load_Glyph(face, gid, FT_LOAD_NO_SCALE)) + return true; // a glyph is mapped but could not be loaded: not sure, so assume it is non-empty + FT_GlyphSlot slot = face->glyph; + // n_contours == 0 means an empty glyph + if (slot->format == FT_GLYPH_FORMAT_OUTLINE && slot->outline.n_contours == 0) + return false; +#endif + return true; +} + } //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc index 6a54194e5..de555df50 100644 --- a/src/HTMLRenderer/general.cc +++ b/src/HTMLRenderer/general.cc @@ -29,6 +29,10 @@ #include "util/css_const.h" #include "util/encoding.h" +#if ENABLE_SVG +#include "CairoFontEngine.h" +#endif + namespace pdf2htmlEX { using std::fixed; @@ -86,11 +90,19 @@ HTMLRenderer::HTMLRenderer(const Param & param) 
[this](double * box, bool partial) { covered_text_detector.add_char_bbox_clipped(box, partial); }; tracer.on_non_char_drawn = [this](double * box) { covered_text_detector.add_non_char_bbox(box); }; + +#if ENABLE_SVG + FT_Init_FreeType(&ft_lib); + font_engine = std::unique_ptr(new CairoFontEngine(ft_lib)); +#endif } HTMLRenderer::~HTMLRenderer() { ffw_finalize(); +#if ENABLE_SVG + font_engine.reset(); FT_Done_FreeType(ft_lib); // font_engine was built on ft_lib; as a member it would otherwise be destroyed after this body, i.e. after the library is gone +#endif } void HTMLRenderer::process(PDFDoc *doc) diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc index e58a17a77..a86b3cd4b 100644 --- a/src/HTMLRenderer/text.cc +++ b/src/HTMLRenderer/text.cc @@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) while (len > 0) { auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy); - HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0])); + HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)%s\n", (wchar_t)u[0], u[0], has_glyph(code, font) ? "":" no glyph")); if(!(equal(ox, 0) && equal(oy, 0))) { @@ -113,24 +113,36 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s) } else { - Unicode uu; - if(cur_text_state.font_info->use_tounicode) + if (uLen == 1 && is_illegal_unicode(u[0]) && !has_glyph(code, font)) { - uu = check_unicode(u, uLen, code, font); + // Convert illegal html unicode to a whitespace, if it has no glyph. + // Add a zero-width space AFTER the offset to make sure words are + // delimited, and make sure the ZWSP can be optimized out if the + // offset is represented by a space (see HTMLTextLine::dump_unicode). 
+ html_text_page.get_cur_line()->append_offset(ddx * draw_text_scale); + html_text_page.get_cur_line()->append_unicodes(&zero_width_space, 1, 0); } else { - uu = unicode_from_font(code, font); - } - html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx); - /* - * In PDF, word_space is appended if (n == 1 and *p = ' ') - * but in HTML, word_space is appended if (uu == ' ') - */ - int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0); - if(space_count != 0) - { - html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count); + Unicode uu; + if(cur_text_state.font_info->use_tounicode) + { + uu = check_unicode(u, uLen, code, font); + } + else + { + uu = unicode_from_font(code, font); + } + html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx); + /* + * In PDF, word_space is appended if (n == 1 and *p = ' ') + * but in HTML, word_space is appended if (uu == ' ') + */ + int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0); + if(space_count != 0) + { + html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count); + } } } } diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc index a0be2865d..7d53dd015 100644 --- a/src/HTMLTextLine.cc +++ b/src/HTMLTextLine.cc @@ -13,6 +13,7 @@ #include "util/encoding.h" #include "util/css_const.h" +#include "util/unicode.h" namespace pdf2htmlEX { @@ -32,6 +33,7 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para ,clip_x1(0) ,clip_y1(0) ,width(0) + ,last_output_unicode(0) { } void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width) @@ -88,16 +90,25 @@ void HTMLTextLine::dump_char(std::ostream & out, int pos) int c = text[pos]; if (c > 0) { - Unicode u = c; - writeUnicodes(out, &u, 1); + dump_unicode(out, c); } else if (c < 0) { auto dt = decomposed_text[- c - 1]; - writeUnicodes(out, &dt.front(), dt.size()); + for (auto it = dt.begin(), end = dt.end(); it != end; it++) + dump_unicode(out, *it); } } 
+void HTMLTextLine::dump_unicode(std::ostream & out, Unicode u) // writes the single code point u to out via writeUnicodes and records it in last_output_unicode +{ + // Skip a zero-width space that directly follows a plain space: nothing is written and last_output_unicode is left unchanged. + if (u == zero_width_space && last_output_unicode == ' ') + return; + writeUnicodes(out, &u, 1); + last_output_unicode = u; +} + void HTMLTextLine::dump_chars(ostream & out, int begin, int len) { static const Color transparent(0, 0, 0, true); @@ -162,6 +173,7 @@ void HTMLTextLine::dump_text(ostream & out) << " " << CSS::BOTTOM_CN << all_manager.bottom.install(line_state.y - clip_y1) ; // it will be closed by the first state + last_output_unicode = 0; } std::vector stack; @@ -249,8 +261,7 @@ void HTMLTextLine::dump_text(ostream & out) double space_off = state_iter1->single_space_offset(); if(std::abs(target - space_off) <= param.h_eps) { - Unicode u = ' '; - writeUnicodes(out, &u, 1); + dump_unicode(out, ' '); actual_offset = space_off; done = true; } @@ -269,7 +280,10 @@ void HTMLTextLine::dump_text(ostream & out) double threshold = state_iter1->em_size() * (param.space_threshold); out << "" << (target > (threshold - EPS) ? 
" " : "") << ""; + << ' ' << CSS::WHITESPACE_CN << wid << "\">"; + if (target > (threshold - EPS)) + dump_unicode(out, ' '); + out << ""; } } } diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h index fcce81191..e95def85d 100644 --- a/src/HTMLTextLine.h +++ b/src/HTMLTextLine.h @@ -107,6 +107,7 @@ class HTMLTextLine */ void dump_chars(std::ostream & out, int begin, int len); void dump_char(std::ostream & out, int pos); + void dump_unicode(std::ostream & out, Unicode u); const Param & param; AllStateManager & all_manager; @@ -128,6 +129,8 @@ class HTMLTextLine */ std::vector text; std::vector > decomposed_text; + + Unicode last_output_unicode; //last unicode written to html (chars in tags excluded) }; } // namespace pdf2htmlEX diff --git a/src/util/unicode.h b/src/util/unicode.h index 21006955f..cf139f0ae 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -13,6 +13,8 @@ namespace pdf2htmlEX { +const Unicode zero_width_space = 0x200B; + /** * Check whether a unicode character is illegal for the output HTML. * Unlike PDF readers, browsers has special treatments for such characters (normally treated as