func ParseEntry(r io.Reader) (*AmebloEntry, error) { root, err := html.Parse(r) if err != nil { return nil, err } s, _ := selector.Selector(".articleText") nodes := s.Find(root) if len(nodes) == 0 { return nil, nil } content := h5.RenderNodesToString(nodes) s, _ = selector.Selector("title") nodes = s.Find(root) if len(nodes) == 0 { return nil, nil } title := extractText(nodes[0].FirstChild) entry := &AmebloEntry{ Title: strings.Split(title, "|")[0], Content: content, } return entry, nil }
func TestSelectorMatch(t *testing.T) { for _, spec := range matchers { chn, err := Selector(spec.s) if err != nil { t.Errorf("Error parsing selector %q", err) } if !chn.Head.Match(spec.n) { t.Errorf("spec %q didn't match %q when it should have", chn, h5.RenderNodesToString([]*html.Node{spec.n})) } if chn.Head.Match(spec.n2) { t.Errorf("spec %q matched %q when it shouldn't have", chn, h5.RenderNodesToString([]*html.Node{spec.n2})) } } }
func TestSelectorFind(t *testing.T) { for _, spec := range finders { chn, err := Selector(spec.s) if err != nil { t.Errorf("Error parsing selector %q", err) } ns := chn.Find(spec.n) if len(ns) < 1 { t.Errorf("%q didn't find any nodes in %q", chn, h5.RenderNodesToString([]*html.Node{spec.n})) } if h5.RenderNodesToString(ns) != h5.RenderNodesToString(spec.ns) { t.Errorf("%q != %q", h5.RenderNodesToString(ns), h5.RenderNodesToString(spec.ns)) } } }
func CompareNodes(originalNodes, expectedNodes []*html.Node) *Error { if len(originalNodes) != len(expectedNodes) { return &Error{ Description: fmt.Sprintf("Expected node to have %v elements, but got %v", len(expectedNodes), len(originalNodes)), Got: h5.RenderNodesToString(originalNodes), Expected: h5.RenderNodesToString(expectedNodes), } } for i, node := range originalNodes { expectedNode := expectedNodes[i] err := CompareNode(node, expectedNode) if err != nil { return err } } return nil }
func CompareNode(originalNode, expectedNode *html.Node) *Error { err := &Error{ Got: h5.RenderNodesToString([]*html.Node{originalNode}), Expected: h5.RenderNodesToString([]*html.Node{expectedNode}), } if originalNode.Type != expectedNode.Type { err.Description = "Node type does not match" return err } if originalNode.Data != expectedNode.Data { err.Description = "Nodes data does not match" return err } for _, attr := range expectedNode.Attr { attrFound := false attrValueSame := false for _, originalAttr := range originalNode.Attr { if originalAttr.Key == attr.Key { attrFound = true if originalAttr.Val == attr.Val { attrValueSame = true } else { if attr.Key == "class" { attrValueSame = equalWithSeparator(originalAttr.Val, attr.Val, " ") } if attr.Key == "style" { attrValueSame = equalWithSeparator(originalAttr.Val, attr.Val, ";") } } } } if !attrFound { err.Description = fmt.Sprintf("Attribute %v not found in node", attr.Key) return err } if !attrValueSame { err.Description = fmt.Sprintf("Attribute %v value is different", attr.Key) return err } } if len(originalNode.Attr) != len(expectedNode.Attr) { err.Description = "Different number of node attributes" return err } return CompareNodes(h5.Children(originalNode), h5.Children(expectedNode)) }
// Replace constructs a TransformFunc that replaces a node with the nodes passed // in. func Replace(ns ...*html.Node) TransformFunc { return func(n *html.Node) { p := n.Parent switch p { case nil: panic(fmt.Sprintf("Attempt to replace Root node: %s", h5.RenderNodesToString([]*html.Node{n}))) default: for _, nc := range ns { p.InsertBefore(nc, n) } p.RemoveChild(n) } } }
func getTitleNode(document *h5.Tree) (titleNode string, err error) { var chain *selector.Chain if chain, err = selector.Selector("title"); err != nil { return } if matches := chain.Find(document.Top()); len(matches) > 0 { match := matches[0:1] titleNode = h5.RenderNodesToString(match) } return }
func rewriteBody(containerSelector string, dest io.Writer, body string) (err error) { if containerSelector == "" { dest.Write([]byte(body)) return } var chain *selector.Chain var document *h5.Tree if document, err = h5.NewFromString(body); err != nil { err = fmt.Errorf("invalid html document: %v", err) return } var titleNode string if titleNode, err = getTitleNode(document); err != nil { return } if chain, err = selector.Selector(containerSelector); err != nil { err = fmt.Errorf("invalid css: %v", containerSelector) return } if matches := chain.Find(document.Top()); len(matches) > 0 { match := matches[0:1] // Take only the first match newBody := h5.RenderNodesToString(h5.Children(match[0])) fmt.Printf("data: %v", h5.Data(match[0])) dest.Write([]byte(titleNode)) dest.Write([]byte(newBody)) return } err = fmt.Errorf("container not found") return }
// extractText returns the rendered markup of n as produced by
// h5.RenderNodesToString, or the empty string when n is nil.
func extractText(n *html.Node) string {
	// A nil node (for example the FirstChild of an empty element) has nothing
	// to render; do not hand it to the renderer.
	if n == nil {
		return ""
	}
	return h5.RenderNodesToString([]*html.Node{n})
}