之前 Google 开发了一个名为 strip 的库,后来不知为何(也许是嫌它太重)被搁置了。下面是原库的代码,将其复制到 src/strip/strip.go 中即可运行。
// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package strip import ( "bytes" "encoding/json" "fmt" "html" "io" "io/ioutil" "path/filepath" "reflect" "strings" "sync" "text/template" "text/template/parse" "unicode" "unicode/utf8" ) // htmlNospaceEscaper escapes for inclusion in unquoted attribute values. func htmlNospaceEscaper(args ...interface{}) string { s, t := stringify(args...) if t == contentTypeHTML { return htmlReplacer(StripTags(s), htmlNospaceNormReplacementTable, false) } return htmlReplacer(s, htmlNospaceReplacementTable, false) } // attrEscaper escapes for inclusion in quoted attribute values. func attrEscaper(args ...interface{}) string { s, t := stringify(args...) if t == contentTypeHTML { return htmlReplacer(StripTags(s), htmlNormReplacementTable, true) } return htmlReplacer(s, htmlReplacementTable, true) } // rcdataEscaper escapes for inclusion in an RCDATA element body. func rcdataEscaper(args ...interface{}) string { s, t := stringify(args...) if t == contentTypeHTML { return htmlReplacer(s, htmlNormReplacementTable, true) } return htmlReplacer(s, htmlReplacementTable, true) } // htmlEscaper escapes for inclusion in HTML text. func htmlEscaper(args ...interface{}) string { s, t := stringify(args...) if t == contentTypeHTML { return s } return htmlReplacer(s, htmlReplacementTable, true) } // htmlReplacementTable contains the runes that need to be escaped // inside a quoted attribute value or in a text node. var htmlReplacementTable = []string{ // http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state // U+0000 NULL Parse error. Append a U+FFFD REPLACEMENT // CHARACTER character to the current attribute's value. 
// " // and similarly // http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state 0: "\uFFFD", '"': """, '&': "&", '\'': "'", '+': "+", '<': "<", '>': ">", } // htmlNormReplacementTable is like htmlReplacementTable but without '&' to // avoid over-encoding existing entities. var htmlNormReplacementTable = []string{ 0: "\uFFFD", '"': """, '\'': "'", '+': "+", '<': "<", '>': ">", } // htmlNospaceReplacementTable contains the runes that need to be escaped // inside an unquoted attribute value. // The set of runes escaped is the union of the HTML specials and // those determined by running the JS below in browsers: // <div id=d></div> // <script>(function () { // var a = [], d = document.getElementById("d"), i, c, s; // for (i = 0; i < 0x10000; ++i) { // c = String.fromCharCode(i); // d.innerHTML = "<span title=" + c + "lt" + c + "></span>" // s = d.getElementsByTagName("SPAN")[0]; // if (!s || s.title !== c + "lt" + c) { a.push(i.toString(16)); } // } // document.write(a.join(", ")); // })()</script> var htmlNospaceReplacementTable = []string{ 0: "�", '\t': "	", '\n': " ", '\v': "", '\f': "", '\r': " ", ' ': " ", '"': """, '&': "&", '\'': "'", '+': "+", '<': "<", '=': "=", '>': ">", // A parse error in the attribute value (unquoted) and // before attribute value states. // Treated as a quoting character by IE. '`': "`", } // htmlNospaceNormReplacementTable is like htmlNospaceReplacementTable but // without '&' to avoid over-encoding existing entities. var htmlNospaceNormReplacementTable = []string{ 0: "�", '\t': "	", '\n': " ", '\v': "", '\f': "", '\r': " ", ' ': " ", '"': """, '\'': "'", '+': "+", '<': "<", '=': "=", '>': ">", // A parse error in the attribute value (unquoted) and // before attribute value states. // Treated as a quoting character by IE. '`': "`", } // htmlReplacer returns s with runes replaced according to replacementTable // and when badRunes is true, certain bad runes are allowed through unescaped. 
func htmlReplacer(s string, replacementTable []string, badRunes bool) string { written, b := 0, new(bytes.Buffer) for i, r := range s { if int(r) < len(replacementTable) { if repl := replacementTable[r]; len(repl) != 0 { b.WriteString(s[written:i]) b.WriteString(repl) // Valid as long as replacementTable doesn't // include anything above 0x7f. written = i + utf8.RuneLen(r) } } else if badRunes { // No-op. // IE does not allow these ranges in unquoted attrs. } else if 0xfdd0 <= r && r <= 0xfdef || 0xfff0 <= r && r <= 0xffff { fmt.Fprintf(b, "%s&#x%x;", s[written:i], r) written = i + utf8.RuneLen(r) } } if written == 0 { return s } b.WriteString(s[written:]) return b.String() } // stripTags takes a snippet of HTML and returns only the text content. // For example, `<b>¡Hi!</b> <script>...</script>` -> `¡Hi! `. func StripTags(html string) string { var b bytes.Buffer s, c, i, allText := []byte(html), context{}, 0, true // Using the transition funcs helps us avoid mangling // `<div title="1>2">` or `I <3 Ponies!`. for i != len(s) { if c.delim == delimNone { st := c.state // Use RCDATA instead of parsing into JS or CSS styles. if c.element != elementNone && !isInTag(st) { st = stateRCDATA } d, nread := transitionFunc[st](c, s[i:]) i1 := i + nread if c.state == stateText || c.state == stateRCDATA { // Emit text up to the start of the tag or comment. j := i1 if d.state != c.state { for j1 := j - 1; j1 >= i; j1-- { if s[j1] == '<' { j = j1 break } } } b.Write(s[i:j]) } else { allText = false } c, i = d, i1 continue } i1 := i + bytes.IndexAny(s[i:], delimEnds[c.delim]) if i1 < i { break } if c.delim != delimSpaceOrTagEnd { // Consume any quote. i1++ } c, i = context{state: stateTag, element: c.element}, i1 } if allText { return html } else if c.state == stateText || c.state == stateRCDATA { b.Write(s[i:]) } return b.String() } // htmlNameFilter accepts valid parts of an HTML attribute or tag name or // a known-safe HTML attribute. 
func htmlNameFilter(args ...interface{}) string { s, t := stringify(args...) if t == contentTypeHTMLAttr { return s } if len(s) == 0 { // Avoid violation of structure preservation. // <input checked {{.K}}={{.V}}>. // Without this, if .K is empty then .V is the value of // checked, but otherwise .V is the value of the attribute // named .K. return filterFailsafe } s = strings.ToLower(s) if t := attrType(s); t != contentTypePlain { // TODO: Split attr and element name part filters so we can whitelist // attributes. return filterFailsafe } for _, r := range s { switch { case '0' <= r && r <= '9': case 'a' <= r && r <= 'z': default: return filterFailsafe } } return s } // commentEscaper returns the empty string regardless of input. // Comment content does not correspond to any parsed structure or // human-readable content, so the simplest and most secure policy is to drop // content interpolated into comments. // This approach is equally valid whether or not static comment content is // removed from the template. func commentEscaper(args ...interface{}) string { return "" } // Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // context describes the state an HTML parser must be in when it reaches the // portion of HTML produced by evaluating a particular template node. // // The zero value of type context is the start context for a template that // produces an HTML fragment as defined at // http://www.w3.org/TR/html5/syntax.html#the-end // where the context element is null. type context struct { state state delim delim urlPart urlPart jsCtx jsCtx attr attr element element err *Error } func (c context) String() string { return fmt.Sprintf("{%v %v %v %v %v %v %v}", c.state, c.delim, c.urlPart, c.jsCtx, c.attr, c.element, c.err) } // eq reports whether two contexts are equal. 
func (c context) eq(d context) bool { return c.state == d.state && c.delim == d.delim && c.urlPart == d.urlPart && c.jsCtx == d.jsCtx && c.attr == d.attr && c.element == d.element && c.err == d.err } // mangle produces an identifier that includes a suffix that distinguishes it // from template names mangled with different contexts. func (c context) mangle(templateName string) string { // The mangled name for the default context is the input templateName. if c.state == stateText { return templateName } s := templateName + "$htmltemplate_" + c.state.String() if c.delim != 0 { s += "_" + c.delim.String() } if c.urlPart != 0 { s += "_" + c.urlPart.String() } if c.jsCtx != 0 { s += "_" + c.jsCtx.String() } if c.attr != 0 { s += "_" + c.attr.String() } if c.element != 0 { s += "_" + c.element.String() } return s } // state describes a high-level HTML parser state. // // It bounds the top of the element stack, and by extension the HTML insertion // mode, but also contains state that does not correspond to anything in the // HTML5 parsing algorithm because a single token production in the HTML // grammar may contain embedded actions in a template. For instance, the quoted // HTML attribute produced by // <div title="Hello {{.World}}"> // is a single token in HTML's grammar but in a template spans several nodes. type state uint8 const ( // stateText is parsed character data. An HTML parser is in // this state when its parse position is outside an HTML tag, // directive, comment, and special element body. stateText state = iota // stateTag occurs before an HTML attribute or the end of a tag. stateTag // stateAttrName occurs inside an attribute name. // It occurs between the ^'s in ` ^name^ = value`. stateAttrName // stateAfterName occurs after an attr name has ended but before any // equals sign. It occurs between the ^'s in ` name^ ^= value`. stateAfterName // stateBeforeValue occurs after the equals sign but before the value. 
// It occurs between the ^'s in ` name =^ ^value`. stateBeforeValue // stateHTMLCmt occurs inside an <!-- HTML comment -->. stateHTMLCmt // stateRCDATA occurs inside an RCDATA element (<textarea> or <title>) // as described at http://www.w3.org/TR/html5/syntax.html#elements-0 stateRCDATA // stateAttr occurs inside an HTML attribute whose content is text. stateAttr // stateURL occurs inside an HTML attribute whose content is a URL. stateURL // stateJS occurs inside an event handler or script element. stateJS // stateJSDqStr occurs inside a JavaScript double quoted string. stateJSDqStr // stateJSSqStr occurs inside a JavaScript single quoted string. stateJSSqStr // stateJSRegexp occurs inside a JavaScript regexp literal. stateJSRegexp // stateJSBlockCmt occurs inside a JavaScript /* block comment */. stateJSBlockCmt // stateJSLineCmt occurs inside a JavaScript // line comment. stateJSLineCmt // stateCSS occurs inside a <style> element or style attribute. stateCSS // stateCSSDqStr occurs inside a CSS double quoted string. stateCSSDqStr // stateCSSSqStr occurs inside a CSS single quoted string. stateCSSSqStr // stateCSSDqURL occurs inside a CSS double quoted url("..."). stateCSSDqURL // stateCSSSqURL occurs inside a CSS single quoted url('...'). stateCSSSqURL // stateCSSURL occurs inside a CSS unquoted url(...). stateCSSURL // stateCSSBlockCmt occurs inside a CSS /* block comment */. stateCSSBlockCmt // stateCSSLineCmt occurs inside a CSS // line comment. stateCSSLineCmt // stateError is an infectious error state outside any valid // HTML/CSS/JS construct. 
stateError ) var stateNames = [...]string{ stateText: "stateText", stateTag: "stateTag", stateAttrName: "stateAttrName", stateAfterName: "stateAfterName", stateBeforeValue: "stateBeforeValue", stateHTMLCmt: "stateHTMLCmt", stateRCDATA: "stateRCDATA", stateAttr: "stateAttr", stateURL: "stateURL", stateJS: "stateJS", stateJSDqStr: "stateJSDqStr", stateJSSqStr: "stateJSSqStr", stateJSRegexp: "stateJSRegexp", stateJSBlockCmt: "stateJSBlockCmt", stateJSLineCmt: "stateJSLineCmt", stateCSS: "stateCSS", stateCSSDqStr: "stateCSSDqStr", stateCSSSqStr: "stateCSSSqStr", stateCSSDqURL: "stateCSSDqURL", stateCSSSqURL: "stateCSSSqURL", stateCSSURL: "stateCSSURL", stateCSSBlockCmt: "stateCSSBlockCmt", stateCSSLineCmt: "stateCSSLineCmt", stateError: "stateError", } func (s state) String() string { if int(s) < len(stateNames) { return stateNames[s] } return fmt.Sprintf("illegal state %d", int(s)) } // isComment is true for any state that contains content meant for template // authors & maintainers, not for end-users or machines. func isComment(s state) bool { switch s { case stateHTMLCmt, stateJSBlockCmt, stateJSLineCmt, stateCSSBlockCmt, stateCSSLineCmt: return true } return false } // isInTag return whether s occurs solely inside an HTML tag. func isInTag(s state) bool { switch s { case stateTag, stateAttrName, stateAfterName, stateBeforeValue, stateAttr: return true } return false } // delim is the delimiter that will end the current HTML attribute. type delim uint8 const ( // delimNone occurs outside any attribute. delimNone delim = iota // delimDoubleQuote occurs when a double quote (") closes the attribute. delimDoubleQuote // delimSingleQuote occurs when a single quote (') closes the attribute. delimSingleQuote // delimSpaceOrTagEnd occurs when a space or right angle bracket (>) // closes the attribute. 
delimSpaceOrTagEnd ) var delimNames = [...]string{ delimNone: "delimNone", delimDoubleQuote: "delimDoubleQuote", delimSingleQuote: "delimSingleQuote", delimSpaceOrTagEnd: "delimSpaceOrTagEnd", } func (d delim) String() string { if int(d) < len(delimNames) { return delimNames[d] } return fmt.Sprintf("illegal delim %d", int(d)) } // urlPart identifies a part in an RFC 3986 hierarchical URL to allow different // encoding strategies. type urlPart uint8 const ( // urlPartNone occurs when not in a URL, or possibly at the start: // ^ in "^http://auth/path?k=v#frag". urlPartNone urlPart = iota // urlPartPreQuery occurs in the scheme, authority, or path; between the // ^s in "h^ttp://auth/path^?k=v#frag". urlPartPreQuery // urlPartQueryOrFrag occurs in the query portion between the ^s in // "http://auth/path?^k=v#frag^". urlPartQueryOrFrag // urlPartUnknown occurs due to joining of contexts both before and // after the query separator. urlPartUnknown ) var urlPartNames = [...]string{ urlPartNone: "urlPartNone", urlPartPreQuery: "urlPartPreQuery", urlPartQueryOrFrag: "urlPartQueryOrFrag", urlPartUnknown: "urlPartUnknown", } func (u urlPart) String() string { if int(u) < len(urlPartNames) { return urlPartNames[u] } return fmt.Sprintf("illegal urlPart %d", int(u)) } // jsCtx determines whether a '/' starts a regular expression literal or a // division operator. type jsCtx uint8 const ( // jsCtxRegexp occurs where a '/' would start a regexp literal. jsCtxRegexp jsCtx = iota // jsCtxDivOp occurs where a '/' would start a division operator. jsCtxDivOp // jsCtxUnknown occurs where a '/' is ambiguous due to context joining. jsCtxUnknown ) func (c jsCtx) String() string { switch c { case jsCtxRegexp: return "jsCtxRegexp" case jsCtxDivOp: return "jsCtxDivOp" case jsCtxUnknown: return "jsCtxUnknown" } return fmt.Sprintf("illegal jsCtx %d", int(c)) } // element identifies the HTML element when inside a start tag or special body. 
// Certain HTML element (for example <script> and <style>) have bodies that are // treated differently from stateText so the element type is necessary to // transition into the correct context at the end of a tag and to identify the // end delimiter for the body. type element uint8 const ( // elementNone occurs outside a special tag or special element body. elementNone element = iota // elementScript corresponds to the raw text <script> element. elementScript // elementStyle corresponds to the raw text <style> element. elementStyle // elementTextarea corresponds to the RCDATA <textarea>