Skip to content
This repository was archived by the owner on Dec 16, 2025. It is now read-only.

Commit 7e8a4f5

Browse files
committed
Clone MD parser to create new MD parser and add more tests.
1 parent e6633a8 commit 7e8a4f5

5 files changed

Lines changed: 1787 additions & 0 deletions

File tree

claat/parser/newmd/README.md

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Markdown Parser
2+
3+
The Markdown codelab parser converts a Markdown document into an HTML file and
4+
codelab metadata file.
5+
6+
## Metadata
7+
8+
Metadata consists of key-value pairs of the form "key: value". Keys cannot
9+
contain colons, and separate metadata fields must be separated by blank lines.
10+
At present, values must all be on one line. All metadata must come before the
11+
title. Any arbitrary keys and values may be used; however, only the following
12+
will be understood by the renderer:
13+
14+
- Summary: A human-readable summary of the codelab. Defaults to blank.
15+
- Id: An identifier composed of lowercase letters, ideally describing the
16+
content of the codelab. This field should be unique among
17+
codelabs.
18+
- Categories: A comma-separated list of the topics the codelab covers.
19+
- Environments: A list of environments the codelab should be discoverable in.
20+
Codelabs marked "Web" will be visible at the codelabs index. Codelabs marked
21+
"Kiosk" will only be available at codelabs kiosks, which have special
22+
equipment attached.
23+
- Status: The publication status of the codelab. Valid values are:
24+
- Draft: Codelab is not finished.
25+
- Published: Codelab is finished and visible.
26+
- Deprecated: Codelab is considered stale and should not be widely advertised.
27+
- Hidden: Codelab is not shown in index.
28+
- Feedback Link: A link to send users to if they wish to leave feedback on the
29+
codelab.
30+
- Analytics Account: A Google Analytics ID to include with all codelab pages.
31+
32+
## Title
33+
34+
The title of the codelab directly follows the metadata. The title is a Header 1.
35+
36+
```
37+
# Title of codelab
38+
```
39+
40+
## Steps
41+
42+
A step is declared by putting the step's title in a Header 2. All content
43+
following a step title will be considered part of the step, until the next step
44+
title, or the end of the document.
45+
46+
```
47+
## Codelab Step
48+
```
49+
50+
### Duration
51+
52+
Steps should be marked with the expected duration to complete them. To label a
53+
step with a duration, put "Duration: TIME" by itself on the line directly
54+
following the step title, where TIME is formatted like "hh:mm:ss" (or "mm:ss" if
55+
only one `:` is provided).
56+
57+
```
58+
## Codelab Step
59+
Duration: 1:25
60+
```
61+
62+
### Content
63+
64+
Codelab content may be written in standard Markdown. Some special constructs are
65+
understood:
66+
67+
#### Fenced Code and Language Hints
68+
69+
Code blocks may be declared by placing them between two lines containing just
70+
three backticks (fenced code blocks). The codelab renderer will attempt to
71+
perform syntax highlighting on code blocks, but it is not always effective at
72+
guessing the language to highlight in. Put the name of the code language after
73+
the first fence to explicitly specify which highlighting plan to use.
74+
75+
``` go
76+
This block will be highlighted as Go source code.
77+
```
78+
79+
#### Info Boxes
80+
81+
Info boxes are colored callouts that enclose special information in codelabs.
82+
Positive info boxes should contain positive information like best practices and
83+
time-saving tips. Negative info boxes should contain information like warnings
84+
and API usage restrictions. To create an info box, put the type of info box on a
85+
line by itself, then begin the next line with a colon.
86+
87+
```
88+
Positive
89+
: This will appear in a positive info box.
90+
91+
Negative
92+
: This will appear in a negative info box.
93+
```
94+
95+
#### Download Buttons
96+
97+
Codelabs sometimes contain links to SDKs or sample code. The codelab renderer
98+
will apply special button-esque styling to any link that begins with the word
99+
"Download".
100+
101+
```
102+
[Download SDK](https://www.google.com)
103+
```
104+

claat/parser/newmd/html.go

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
// Copyright 2016 Google Inc. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package md
16+
17+
import (
18+
"bytes"
19+
"strings"
20+
21+
"golang.org/x/net/html"
22+
"golang.org/x/net/html/atom"
23+
)
24+
25+
var (
26+
// headerLevel maps HTML tags to their level in parser.HeaderNode.
27+
// we -1 as H2 is a new step
28+
headerLevel = map[atom.Atom]int{
29+
atom.H3: 2,
30+
atom.H4: 3,
31+
atom.H5: 4,
32+
atom.H6: 5,
33+
}
34+
)
35+
36+
// isHeader returns true if hn is one of secondary headers.
37+
// Step header is not one of them.
38+
func isHeader(hn *html.Node) bool {
39+
_, ok := headerLevel[hn.DataAtom]
40+
return ok
41+
}
42+
43+
func isMeta(hn *html.Node) bool {
44+
elem := strings.ToLower(hn.Data)
45+
return strings.HasPrefix(elem, metaDuration+metaSep) || strings.HasPrefix(elem, metaEnvironment+metaSep)
46+
}
47+
48+
func isBold(hn *html.Node) bool {
49+
if hn.Type == html.TextNode {
50+
hn = hn.Parent
51+
}
52+
return hn.DataAtom == atom.Strong ||
53+
hn.DataAtom == atom.B
54+
}
55+
56+
func isItalic(hn *html.Node) bool {
57+
if hn.Type == html.TextNode {
58+
hn = hn.Parent
59+
}
60+
return hn.DataAtom == atom.Em ||
61+
hn.DataAtom == atom.I
62+
}
63+
64+
func isConsole(hn *html.Node) bool {
65+
if hn.Type == html.TextNode {
66+
hn = hn.Parent
67+
}
68+
return hn.DataAtom == atom.Code && hn.Parent.DataAtom != atom.Pre
69+
}
70+
71+
func isCode(hn *html.Node) bool {
72+
if hn.Type == html.TextNode {
73+
hn = hn.Parent
74+
}
75+
return hn.DataAtom == atom.Code && hn.Parent.DataAtom == atom.Pre
76+
}
77+
78+
func isButton(hn *html.Node) bool {
79+
// TODO: implements
80+
return false
81+
}
82+
83+
func isInfobox(hn *html.Node) bool {
84+
if hn.DataAtom != atom.Dt {
85+
return false
86+
}
87+
return strings.ToLower(hn.FirstChild.Data) == "positive" || isInfoboxNegative(hn)
88+
}
89+
90+
func isInfoboxNegative(hn *html.Node) bool {
91+
if hn.DataAtom != atom.Dt {
92+
return false
93+
}
94+
return strings.ToLower(hn.FirstChild.Data) == "negative"
95+
}
96+
97+
func isSurvey(hn *html.Node) bool {
98+
if hn.DataAtom != atom.Dt {
99+
return false
100+
}
101+
return strings.ToLower(hn.FirstChild.Data) == "survey"
102+
}
103+
104+
func isTable(hn *html.Node) bool {
105+
if hn.DataAtom != atom.Table {
106+
return false
107+
}
108+
return countTwo(hn, atom.Tr) > 1 || countTwo(hn, atom.Td) > 1
109+
}
110+
111+
func isList(hn *html.Node) bool {
112+
return hn.DataAtom == atom.Ul || hn.DataAtom == atom.Ol
113+
}
114+
115+
// countTwo starts counting the number of a Atom children in hn.
116+
// It returns as soon as the count exceeds 1, so the returned value is inexact.
117+
//
118+
// The callers can test for > 1 to verify whether a node contains two
119+
// or more children of the Atom a.
120+
func countTwo(hn *html.Node, a atom.Atom) int {
121+
var count int
122+
for c := hn.FirstChild; c != nil; c = c.NextSibling {
123+
if c.DataAtom == a {
124+
count++
125+
} else {
126+
count += countTwo(c, a)
127+
}
128+
if count > 1 {
129+
break
130+
}
131+
}
132+
return count
133+
}
134+
135+
// countDirect returns the number of immediate children of hn.
136+
func countDirect(hn *html.Node) int {
137+
var count int
138+
for c := hn.FirstChild; c != nil; c = c.NextSibling {
139+
count++
140+
}
141+
return count
142+
}
143+
144+
// findAtom returns first child of root which matches a, nil otherwise.
145+
// It returns root if it is the same Atom as a.
146+
func findAtom(root *html.Node, a atom.Atom) *html.Node {
147+
if root.DataAtom == a {
148+
return root
149+
}
150+
for c := root.FirstChild; c != nil; c = c.NextSibling {
151+
if v := findAtom(c, a); v != nil {
152+
return v
153+
}
154+
}
155+
return nil
156+
}
157+
158+
func findChildAtoms(root *html.Node, a atom.Atom) []*html.Node {
159+
var nodes []*html.Node
160+
for hn := root.FirstChild; hn != nil; hn = hn.NextSibling {
161+
if hn.DataAtom == a {
162+
nodes = append(nodes, hn)
163+
}
164+
nodes = append(nodes, findChildAtoms(hn, a)...)
165+
}
166+
return nodes
167+
}
168+
169+
// findParent is like findAtom but search is in the opposite direction.
170+
// It is faster to look for parent than child lookup in findAtom.
171+
func findParent(root *html.Node, a atom.Atom) *html.Node {
172+
if root.DataAtom == a {
173+
return root
174+
}
175+
for c := root.Parent; c != nil; c = c.Parent {
176+
if c.DataAtom == a {
177+
return c
178+
}
179+
}
180+
return nil
181+
}
182+
183+
var blockParents = map[atom.Atom]struct{}{
184+
atom.H1: {},
185+
atom.H2: {},
186+
atom.H3: {},
187+
atom.H4: {},
188+
atom.H5: {},
189+
atom.H6: {},
190+
atom.Li: {},
191+
atom.P: {},
192+
atom.Div: {},
193+
}
194+
195+
// findBlockParent looks up nearest block parent node of hn.
196+
// For instance, block parent of "text" in <ul><li>text</li></ul> is <li>,
197+
// while block parent of "text" in <p><span>text</span></p> is <p>.
198+
func findBlockParent(hn *html.Node) *html.Node {
199+
for p := hn.Parent; p != nil; p = p.Parent {
200+
if _, ok := blockParents[p.DataAtom]; ok {
201+
return p
202+
}
203+
}
204+
return nil
205+
}
206+
207+
// nodeAttr returns node attribute value of the key name.
208+
// Attribute keys are case insensitive.
209+
func nodeAttr(n *html.Node, name string) string {
210+
name = strings.ToLower(name)
211+
for _, a := range n.Attr {
212+
if strings.ToLower(a.Key) == name {
213+
return a.Val
214+
}
215+
}
216+
return ""
217+
}
218+
219+
// stringifyNode extracts and concatenates all text nodes starting with root.
220+
// Line breaks are inserted at <br> and any non-<span> elements.
221+
func stringifyNode(root *html.Node, trim bool) string {
222+
if root.Type == html.TextNode {
223+
s := textCleaner.Replace(root.Data)
224+
if !trim {
225+
return s
226+
}
227+
return strings.TrimSpace(s)
228+
}
229+
if root.DataAtom == atom.Br && !trim {
230+
return "\n"
231+
}
232+
var buf bytes.Buffer
233+
for c := root.FirstChild; c != nil; c = c.NextSibling {
234+
if c.DataAtom == atom.Br {
235+
buf.WriteRune('\n')
236+
continue
237+
}
238+
if c.Type == html.TextNode {
239+
buf.WriteString(c.Data)
240+
continue
241+
}
242+
if c.DataAtom != atom.Span && c.DataAtom != atom.A {
243+
buf.WriteRune('\n')
244+
}
245+
buf.WriteString(stringifyNode(c, false))
246+
}
247+
s := textCleaner.Replace(buf.String())
248+
if !trim {
249+
return s
250+
}
251+
return strings.TrimSpace(s)
252+
}

0 commit comments

Comments
 (0)