func ParseEntry(r io.Reader) (*AmebloEntry, error) { root, err := html.Parse(r) if err != nil { return nil, err } s, _ := selector.Selector(".articleText") nodes := s.Find(root) if len(nodes) == 0 { return nil, nil } content := h5.RenderNodesToString(nodes) s, _ = selector.Selector("title") nodes = s.Find(root) if len(nodes) == 0 { return nil, nil } title := extractText(nodes[0].FirstChild) entry := &AmebloEntry{ Title: strings.Split(title, "|")[0], Content: content, } return entry, nil }
func parseClosedSearchResult(r io.Reader) ([]string, bool, error) { hasMore := false root, err := html.Parse(r) if err != nil { return nil, false, err } s, _ := css.Selector("td.bi a") nodes := s.Find(root) if len(nodes) > 0 { hasMore = len(nodes) == CLOSED_SEARCH_PARAM_N ids := make([]string, 0) for _, n := range nodes { for _, attr := range n.Attr { if attr.Key == "href" { if _url, err := url.Parse(attr.Val); err == nil { id := _url.Query().Get("aID") if id != "" { ids = append(ids, id) } } } } } return ids, hasMore, nil } else { return nil, false, nil } }
func findOne(sel string, node *html.Node) *html.Node { s, _ := selector.Selector(sel) n := s.Find(node) if n == nil || len(n) == 0 { return nil } return n[0] }
func getTagValueAsString(node *html.Node, selector string) string { s, _ := css.Selector(selector) nodes := s.Find(node) if len(nodes) > 0 { if children := h5.Children(nodes[0]); len(children) > 0 { return children[0].Data } } return "" }
func parseSearchApiResult(r io.Reader) ([]*AuctionItem, bool, error) { hasMore := false root, err := html.Parse(r) if err != nil { return nil, false, err } s, _ := css.Selector("ResultSet") nodes := s.Find(root) if len(nodes) > 0 { var total, returned, firstPos int64 for _, attr := range nodes[0].Attr { switch attr.Key { case "totalresultsavailable": total, _ = strconv.ParseInt(attr.Val, 0, 0) break case "totalresultsreturned": returned, _ = strconv.ParseInt(attr.Val, 0, 0) break case "firstresultposition": firstPos, _ = strconv.ParseInt(attr.Val, 0, 0) break default: break } } hasMore = firstPos+returned-1 < total } else { return nil, false, fmt.Errorf("Invalid response and failed to parse pagination data.") } s, _ = css.Selector("ResultSet Result Item") nodes = s.Find(root) if len(nodes) > 0 { list := make([]*AuctionItem, len(nodes)) for i, n := range nodes { list[i] = parseItemNode(n) } return list, hasMore, nil } return nil, false, nil }
func (t *Transformer) ApplyToFirstMatch(f TransformFunc, sels ...string) error { cs := make([]Collector, 0, len(sels)) for _, sel := range sels { sq, err := selector.Selector(sel) if err != nil { return err } cs = append(cs, sq) } t.ApplyWithCollector(f, FirstMatch(cs...)) return nil }
func extractMatchingHtmlNodes(response *http.Response, cssSelector string) []*html.Node { tree, err := h5.New(response.Body) if err != nil { log.Fatalf("Error parsing body into tree: %v\n", err) } selectorChain, err := selector.Selector(cssSelector) if err != nil { log.Fatalf("Error parsing cssSelector %v: %v\n", cssSelector, err) } return selectorChain.Find(tree.Top()) }
func getTitleNode(document *h5.Tree) (titleNode string, err error) { var chain *selector.Chain if chain, err = selector.Selector("title"); err != nil { return } if matches := chain.Find(document.Top()); len(matches) > 0 { match := matches[0:1] titleNode = h5.RenderNodesToString(match) } return }
func getCanonicalUrl(root *html.Node) string { s, _ := selector.Selector("head link") nodes := s.Find(root) for _, n := range nodes { isCanonical := false href := "" for _, attr := range n.Attr { if attr.Key == "rel" { isCanonical = attr.Val == "canonical" } if attr.Key == "href" { href = attr.Val } } if isCanonical { return href } } return "" }
// Parse the RSS feed and returns iEPG Ids. func ParseRss(r io.Reader) ([]string, error) { root, err := html.Parse(r) if err != nil { return nil, fmt.Errorf("Could not parse RSS feed: %v", err) } s, _ := selector.Selector("item") nodes := s.Find(root) list := []string{} for _, n := range nodes { for i := range n.Attr { if n.Attr[i].Key == "rdf:about" { id := extractIdFromUrl(n.Attr[i].Val) if id != "" { list = append(list, id) } } } } return list, nil }
func getGroupKeyword(n *html.Node) string { s, _ := selector.Selector("#news_detail .icon_box .icon-name") nodes := s.Find(n) if len(nodes) > 0 { s := strings.TrimSpace( html.UnescapeString( extractNodeString(nodes[0]), ), ) // we don't need the year suffix. if strings.HasPrefix(s, "モーニング娘。") { return "モーニング娘。" } // TODO: any other imports than ハロコン? if s == "HELLO! PROJECT" { return "ハロコン" } return s } return "" }
func ParseEntryList(r io.Reader) ([]*AmebloEntry, error) { root, err := html.Parse(r) if err != nil { return nil, err } s, _ := selector.Selector("ul.contentsList li") nodes := s.Find(root) entryList := make([]*AmebloEntry, 0) for _, listItem := range nodes { e := &AmebloEntry{} // title & url n := findOne("a.contentTitle", listItem) e.Title = extractText(n.FirstChild) e.Url = getAttributeValue("href", n) // postAt n = findOne(".contentTime time", listItem) e.PostAt, err = time.ParseInLocation(TIME_FORMAT, extractText(n.FirstChild), JST) if err != nil { continue } div := findOne(".contentDetailArea", listItem) // AmLikes and AmComments n = findOne("a.contentComment", div) if n != nil { e.AmComments, _ = strconv.Atoi( numRegexp.FindString(extractText(n.FirstChild)), ) } n = findOne("a.skinWeakColor", div) if n != nil { e.AmLikes, _ = strconv.Atoi( numRegexp.FindString(extractText(n.FirstChild)), ) } entryList = append(entryList, e) } return entryList, nil }
func rewriteBody(containerSelector string, dest io.Writer, body string) (err error) { if containerSelector == "" { dest.Write([]byte(body)) return } var chain *selector.Chain var document *h5.Tree if document, err = h5.NewFromString(body); err != nil { err = fmt.Errorf("invalid html document: %v", err) return } var titleNode string if titleNode, err = getTitleNode(document); err != nil { return } if chain, err = selector.Selector(containerSelector); err != nil { err = fmt.Errorf("invalid css: %v", containerSelector) return } if matches := chain.Find(document.Top()); len(matches) > 0 { match := matches[0:1] // Take only the first match newBody := h5.RenderNodesToString(h5.Children(match[0])) fmt.Printf("data: %v", h5.Data(match[0])) dest.Write([]byte(titleNode)) dest.Write([]byte(newBody)) return } err = fmt.Errorf("container not found") return }
// Build will construct a Java Docset for Dash, using the Javadoc contained in // the javadocPath provided. func Build(javadocPath string, docsetRoot, docsetName string) error { if exists, err := pathExists(javadocPath); !exists { return errors.New("javadoc path does not exist") } else if err != nil { return err } if exists, err := pathExists(docsetRoot); !exists { return errors.New("docset root path does not exist") } else if err != nil { return err } docsetPath := filepath.Join(docsetRoot, docsetName+".docset") contentsPath := filepath.Join(docsetPath, "Contents") resourcesDir := filepath.Join(contentsPath, "Resources") documentsDir := filepath.Join(resourcesDir, "Documents") if err := os.MkdirAll(documentsDir, 0755); err != nil { return err } if err := copyPath(javadocPath, documentsDir); err != nil { return err } plistPath := filepath.Join(contentsPath, "Info.plist") if err := writePlist(plistPath, docsetName); err != nil { return err } indexFile, err := os.Open(filepath.Join(javadocPath, "index-all.html")) if err != nil { return err } defer indexFile.Close() db, err := initDb(filepath.Join(resourcesDir, "docSet.dsidx")) if err != nil { return err } defer db.Close() tree, err := h5.New(indexFile) if err != nil { return err } itemSelector, err := selector.Selector("dl dt") if err != nil { return err } anchorSelector, err := selector.Selector("a") if err != nil { return err } for _, node := range itemSelector.Find(tree.Top()) { text := nodeText(node, false) anchor := anchorSelector.Find(node)[0] itemType := "" switch { case strings.Contains(text, "Class in"): itemType = "Class" case strings.Contains(text, "Static method in"): itemType = "Method" case strings.Contains(text, "Static variable in"): itemType = "Field" case strings.Contains(text, "Constructor"): itemType = "Constructor" case strings.Contains(text, "Method in"): itemType = "Method" case strings.Contains(text, "Variable in"): itemType = "Field" case strings.Contains(text, "Interface in"): itemType = "Interface" case strings.Contains(text, "Exception in"): itemType = "Exception" case strings.Contains(text, "Error in"): itemType = "Error" case strings.Contains(text, "Enum in"): itemType = "Enum" case strings.Contains(text, "package"): itemType = "Package" case strings.Contains(text, "Annotation Type"): itemType = "Notation" } tx, err := db.Begin() if err != nil { return err } statement, err := tx.Prepare("insert into searchIndex(name, type, path) VALUES(?, ?, ?)") if err != nil { return err } defer statement.Close() if itemType != "" { itemName := nodeText(anchor, true) _, err := statement.Exec(itemName, itemType, nodeAttr(anchor, "href")) if err != nil { return err } } tx.Commit() } return nil }
// Trans creates a Transform that you can apply using ApplyAll. // It takes a TransformFunc and a valid CSS3 Selector. // It returns a *Transform or an error if the selector wasn't valid func Trans(f TransformFunc, sel string) (*Transform, error) { sq, err := selector.Selector(sel) return TransCollector(f, sq), err }
// SubTransform constructs a TransformFunc that runs a TransformFunc on // any nodes in the tree rooted by the node the the TransformFunc is run // against. // This is useful for creating self contained Transforms that are // meant to work on subtrees of the html document. func Subtransform(f TransformFunc, sel string) (TransformFunc, error) { sq, err := selector.Selector(sel) return SubtransformCollector(f, sq), err }
// The ApplyWithSelector method applies a TransformFunc to the nodes matched // by the CSS3 Selector. func (t *Transformer) Apply(f TransformFunc, sel string) error { sq, err := selector.Selector(sel) t.ApplyWithCollector(f, sq) return err }
func (importer *HPEventImporter) Import(root *html.Node) ([]*event.Show, error) { groupName := getGroupKeyword(root) s, _ := selector.Selector("#concert_schedule tr") nodes := s.Find(root) if len(nodes) == 0 { return nil, nil } var idxDate, idxVenue, idxOpen, idxStart int dateBucket := make(map[string][]*event.Show) shows := make([]*event.Show, len(nodes)-1) for i, row := range nodes { columns := h5.Children(row) if i == 0 { // header row for j, col := range columns { val := extractNodeString(col) switch val { case "日程": idxDate = j break case "会場": idxVenue = j break case "開場": idxOpen = j break case "開演": idxStart = j break } } } else { // show row s := &event.Show{} s.VenueName = parseVenue(extractNodeString(columns[idxVenue])) datestr := extractNodeString(columns[idxDate]) s.OpenAt = parseDate(datestr, extractNodeString(columns[idxOpen])) s.StartAt = parseDate(datestr, extractNodeString(columns[idxStart])) s.CreatedAt = time.Now() s.UpdatedAt = s.CreatedAt shows[i-1] = s if bucket, ok := dateBucket[datestr]; !ok { dateBucket[datestr] = []*event.Show{s} } else { dateBucket[datestr] = append(bucket, s) } } } // Set Keywords for _, bucket := range dateBucket { numShowsInDay := len(bucket) if numShowsInDay == 1 { startAt := bucket[0].StartAt bucket[0].YAKeyword = fmt.Sprintf( "%d/%d %s", startAt.Month(), startAt.Day(), groupName, ) } else { for _, show := range bucket { startAt := show.StartAt timeKey := "" if startAt.Hour() < 12 { timeKey = "朝" } else if startAt.Hour() > 17 { timeKey = "夜" } else { timeKey = "昼" } show.YAKeyword = fmt.Sprintf( "%d/%d %s %s", startAt.Month(), startAt.Day(), timeKey, groupName, ) } } } return shows, nil }