示例#1
0
文件: ameblo.go 项目: speedland/lib
// ParseEntry reads an HTML document from r and builds an AmebloEntry from it.
//
// Content is the rendered markup of every node matching ".articleText".
// Title is derived from the first child of the first <title> element (via
// extractText), truncated at the first "|".
//
// It returns (nil, nil) when the document has no ".articleText" node or no
// <title> element. A non-nil error is returned only when html.Parse fails.
func ParseEntry(r io.Reader) (*AmebloEntry, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return nil, err
	}

	// The selectors below are constant literals, so their compile errors are
	// deliberately ignored.
	articleSel, _ := selector.Selector(".articleText")
	articleNodes := articleSel.Find(doc)
	if len(articleNodes) == 0 {
		return nil, nil
	}
	body := h5.RenderNodesToString(articleNodes)

	titleSel, _ := selector.Selector("title")
	titleNodes := titleSel.Find(doc)
	if len(titleNodes) == 0 {
		return nil, nil
	}
	fullTitle := extractText(titleNodes[0].FirstChild)

	// Keep only the part of the page title that precedes the first "|".
	return &AmebloEntry{
		Title:   strings.Split(fullTitle, "|")[0],
		Content: body,
	}, nil
}
示例#2
0
文件: crawler.go 项目: speedland/apps
// parseClosedSearchResult parses the HTML read from r and collects the "aID"
// query parameter from the href of every node matching "td.bi a".
//
// The boolean result is true when the number of matched links equals
// CLOSED_SEARCH_PARAM_N. When no link matches, it returns (nil, false, nil).
// Links whose href cannot be parsed as a URL, or that carry no "aID" value,
// are skipped. A non-nil error is returned only when html.Parse fails.
func parseClosedSearchResult(r io.Reader) ([]string, bool, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return nil, false, err
	}

	// Constant selector: the compile error is deliberately ignored.
	linkSel, _ := css.Selector("td.bi a")
	links := linkSel.Find(doc)
	if len(links) == 0 {
		return nil, false, nil
	}

	// Non-nil even when empty, so callers can tell "links found but no IDs"
	// apart from "no links at all".
	ids := []string{}
	for _, link := range links {
		for _, attr := range link.Attr {
			if attr.Key != "href" {
				continue
			}
			target, parseErr := url.Parse(attr.Val)
			if parseErr != nil {
				continue
			}
			if id := target.Query().Get("aID"); id != "" {
				ids = append(ids, id)
			}
		}
	}
	return ids, len(links) == CLOSED_SEARCH_PARAM_N, nil
}
示例#3
0
文件: ameblo.go 项目: speedland/lib
// findOne returns the first node under node that matches the CSS selector
// sel, or nil when nothing matches.
//
// It also returns nil when node is nil or when sel fails to compile, so the
// result of one lookup can safely be fed into another without an intermediate
// nil check.
func findOne(sel string, node *html.Node) *html.Node {
	if node == nil {
		return nil
	}
	s, err := selector.Selector(sel)
	if err != nil {
		// An invalid selector cannot match anything; report "not found"
		// rather than calling Find on an unusable chain.
		return nil
	}
	matches := s.Find(node)
	if len(matches) == 0 {
		return nil
	}
	return matches[0]
}
示例#4
0
文件: crawler.go 项目: speedland/apps
// getTagValueAsString returns the Data field of the first child of the first
// node under node that matches the CSS selector.
//
// It returns "" when the selector fails to compile, when nothing matches, or
// when the first match has no children.
func getTagValueAsString(node *html.Node, selector string) string {
	s, err := css.Selector(selector)
	if err != nil {
		// The selector is caller-supplied; treat an invalid one as "no value"
		// instead of calling Find on an unusable chain.
		return ""
	}
	nodes := s.Find(node)
	if len(nodes) == 0 {
		return ""
	}
	children := h5.Children(nodes[0])
	if len(children) == 0 {
		return ""
	}
	return children[0].Data
}
示例#5
0
文件: crawler.go 项目: speedland/apps
// parseSearchApiResult parses a search API response read from r.
//
// Pagination data is read from the attributes "totalresultsavailable",
// "totalresultsreturned" and "firstresultposition" of the first node matching
// "ResultSet"; attributes that are missing or fail to parse count as 0.
// The boolean result reports whether firstPos+returned-1 < total.
//
// Each node matching "ResultSet Result Item" is converted with parseItemNode.
// When there are no such nodes, it returns (nil, false, nil) regardless of the
// pagination data. An error is returned when html.Parse fails or when no
// "ResultSet" node is present.
func parseSearchApiResult(r io.Reader) ([]*AuctionItem, bool, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return nil, false, err
	}

	// Constant selectors: compile errors are deliberately ignored.
	resultSetSel, _ := css.Selector("ResultSet")
	resultSets := resultSetSel.Find(doc)
	if len(resultSets) == 0 {
		return nil, false, fmt.Errorf("Invalid response and failed to parse pagination data.")
	}

	var total, returned, firstPos int64
	for _, attr := range resultSets[0].Attr {
		switch attr.Key {
		case "totalresultsavailable":
			total, _ = strconv.ParseInt(attr.Val, 0, 0)
		case "totalresultsreturned":
			returned, _ = strconv.ParseInt(attr.Val, 0, 0)
		case "firstresultposition":
			firstPos, _ = strconv.ParseInt(attr.Val, 0, 0)
		}
	}

	itemSel, _ := css.Selector("ResultSet Result Item")
	itemNodes := itemSel.Find(doc)
	if len(itemNodes) == 0 {
		return nil, false, nil
	}

	items := make([]*AuctionItem, 0, len(itemNodes))
	for _, node := range itemNodes {
		items = append(items, parseItemNode(node))
	}
	return items, firstPos+returned-1 < total, nil
}
示例#6
0
// ApplyToFirstMatch compiles every selector in sels, wraps the resulting
// collectors with FirstMatch, and applies f through ApplyWithCollector.
//
// If any selector fails to compile, its error is returned and f is not
// applied at all.
func (t *Transformer) ApplyToFirstMatch(f TransformFunc, sels ...string) error {
	collectors := make([]Collector, len(sels))
	for i, sel := range sels {
		chain, err := selector.Selector(sel)
		if err != nil {
			return err
		}
		collectors[i] = chain
	}
	t.ApplyWithCollector(f, FirstMatch(collectors...))
	return nil
}
示例#7
0
// extractMatchingHtmlNodes parses response.Body into an HTML tree and returns
// the nodes that match cssSelector.
//
// Both a body that cannot be parsed and a selector that cannot be compiled
// terminate the process through log.Fatalf. The response body is not closed
// here.
func extractMatchingHtmlNodes(response *http.Response, cssSelector string) []*html.Node {
	doc, parseErr := h5.New(response.Body)
	if parseErr != nil {
		log.Fatalf("Error parsing body into tree: %v\n", parseErr)
	}

	chain, selErr := selector.Selector(cssSelector)
	if selErr != nil {
		log.Fatalf("Error parsing cssSelector %v: %v\n", cssSelector, selErr)
	}

	top := doc.Top()
	return chain.Find(top)
}
示例#8
0
// getTitleNode returns the rendered markup of the first node in document that
// matches the "title" selector.
//
// It returns "" with a nil error when the document has no such node, and a
// non-nil error only when the selector fails to compile.
func getTitleNode(document *h5.Tree) (titleNode string, err error) {
	chain, err := selector.Selector("title")
	if err != nil {
		return "", err
	}

	matches := chain.Find(document.Top())
	if len(matches) == 0 {
		return "", nil
	}

	// Render only the first match.
	return h5.RenderNodesToString(matches[:1]), nil
}
示例#9
0
// getCanonicalUrl returns the href of the first node matching "head link"
// whose rel attribute is "canonical", or "" when there is none.
//
// A canonical link without an href attribute also yields "".
func getCanonicalUrl(root *html.Node) string {
	// Constant selector: the compile error is deliberately ignored.
	linkSel, _ := selector.Selector("head link")
	for _, link := range linkSel.Find(root) {
		var href string
		canonical := false
		for _, attr := range link.Attr {
			switch attr.Key {
			case "rel":
				canonical = attr.Val == "canonical"
			case "href":
				href = attr.Val
			}
		}
		if canonical {
			return href
		}
	}
	return ""
}
示例#10
0
文件: iepg.go 项目: speedland/lib
// ParseRss parses the RSS feed read from r and returns the iEPG IDs in it.
//
// For every node matching "item", each "rdf:about" attribute value is passed
// to extractIdFromUrl and non-empty results are collected in the order the
// nodes are found. The returned slice is non-nil even when no ID is found.
// An error is returned only when the feed cannot be parsed.
func ParseRss(r io.Reader) ([]string, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return nil, fmt.Errorf("Could not parse RSS feed: %v", err)
	}

	// Constant selector: the compile error is deliberately ignored.
	itemSel, _ := selector.Selector("item")
	ids := []string{}
	for _, item := range itemSel.Find(doc) {
		for _, attr := range item.Attr {
			if attr.Key != "rdf:about" {
				continue
			}
			if id := extractIdFromUrl(attr.Val); id != "" {
				ids = append(ids, id)
			}
		}
	}
	return ids, nil
}
示例#11
0
// getGroupKeyword derives a group keyword from the first node under n that
// matches "#news_detail .icon_box .icon-name".
//
// The node's string (via extractNodeString) is HTML-unescaped and trimmed,
// then normalized: anything starting with "モーニング娘。" becomes exactly
// that prefix, "HELLO! PROJECT" becomes "ハロコン", and any other value is
// returned as is. It returns "" when no node matches.
func getGroupKeyword(n *html.Node) string {
	// Constant selector: the compile error is deliberately ignored.
	nameSel, _ := selector.Selector("#news_detail .icon_box .icon-name")
	matches := nameSel.Find(n)
	if len(matches) == 0 {
		return ""
	}

	raw := extractNodeString(matches[0])
	name := strings.TrimSpace(html.UnescapeString(raw))

	switch {
	case strings.HasPrefix(name, "モーニング娘。"):
		// we don't need the year suffix.
		return "モーニング娘。"
	case name == "HELLO! PROJECT":
		// TODO: any other imports than ハロコン?
		return "ハロコン"
	default:
		return name
	}
}
示例#12
0
文件: ameblo.go 项目: speedland/lib
// ParseEntryList parses an entry-list page read from r and returns one
// AmebloEntry per node matching "ul.contentsList li".
//
// A list item is skipped when it has no "a.contentTitle" link, no
// ".contentTime time" node, or a timestamp that does not parse with
// TIME_FORMAT in the JST location. AmComments and AmLikes are filled in only
// when the corresponding links exist inside ".contentDetailArea"; otherwise
// they keep their zero value.
//
// The returned slice is non-nil even when empty. An error is returned only
// when html.Parse fails.
func ParseEntryList(r io.Reader) ([]*AmebloEntry, error) {
	root, err := html.Parse(r)
	if err != nil {
		return nil, err
	}
	// Constant selector: the compile error is deliberately ignored.
	s, _ := selector.Selector("ul.contentsList li")
	entryList := make([]*AmebloEntry, 0)

	for _, listItem := range s.Find(root) {
		// Title link and timestamp are mandatory; findOne returns nil when a
		// node is absent, and dereferencing it would panic.
		titleLink := findOne("a.contentTitle", listItem)
		timeNode := findOne(".contentTime time", listItem)
		if titleLink == nil || timeNode == nil {
			continue
		}

		postAt, err := time.ParseInLocation(TIME_FORMAT, extractText(timeNode.FirstChild), JST)
		if err != nil {
			continue
		}

		e := &AmebloEntry{
			Title:  extractText(titleLink.FirstChild),
			Url:    getAttributeValue("href", titleLink),
			PostAt: postAt,
		}

		// AmLikes and AmComments are optional; a failed Atoi leaves 0.
		if div := findOne(".contentDetailArea", listItem); div != nil {
			if n := findOne("a.contentComment", div); n != nil {
				e.AmComments, _ = strconv.Atoi(
					numRegexp.FindString(extractText(n.FirstChild)),
				)
			}
			if n := findOne("a.skinWeakColor", div); n != nil {
				e.AmLikes, _ = strconv.Atoi(
					numRegexp.FindString(extractText(n.FirstChild)),
				)
			}
		}
		entryList = append(entryList, e)
	}
	return entryList, nil
}
示例#13
0
func rewriteBody(containerSelector string, dest io.Writer, body string) (err error) {
	if containerSelector == "" {
		dest.Write([]byte(body))
		return
	}

	var chain *selector.Chain
	var document *h5.Tree

	if document, err = h5.NewFromString(body); err != nil {
		err = fmt.Errorf("invalid html document: %v", err)
		return
	}

	var titleNode string
	if titleNode, err = getTitleNode(document); err != nil {
		return
	}

	if chain, err = selector.Selector(containerSelector); err != nil {
		err = fmt.Errorf("invalid css: %v", containerSelector)
		return
	}

	if matches := chain.Find(document.Top()); len(matches) > 0 {
		match := matches[0:1] // Take only the first match
		newBody := h5.RenderNodesToString(h5.Children(match[0]))

		fmt.Printf("data: %v", h5.Data(match[0]))

		dest.Write([]byte(titleNode))
		dest.Write([]byte(newBody))
		return
	}

	err = fmt.Errorf("container not found")
	return
}
示例#14
0
// Build constructs a Java docset for Dash from the Javadoc found at
// javadocPath.
//
// The docset is created at <docsetRoot>/<docsetName>.docset. The Javadoc tree
// is copied into Contents/Resources/Documents, an Info.plist is written, and
// the search index (Contents/Resources/docSet.dsidx) is populated from the
// "dl dt" entries of the Javadoc's index-all.html.
//
// All index rows are inserted in a single transaction: if any step fails, the
// transaction is rolled back and the error is returned. Index entries whose
// description matches none of the known kinds, or that contain no anchor, are
// skipped.
//
// NOTE(review): assumes db behaves like database/sql's *sql.DB (Rollback
// after a successful Commit is harmless) — confirm against initDb.
func Build(javadocPath string, docsetRoot, docsetName string) error {
	// Check the error first so a failed existence check is not misreported
	// as "does not exist".
	if exists, err := pathExists(javadocPath); err != nil {
		return err
	} else if !exists {
		return errors.New("javadoc path does not exist")
	}

	if exists, err := pathExists(docsetRoot); err != nil {
		return err
	} else if !exists {
		return errors.New("docset root path does not exist")
	}

	docsetPath := filepath.Join(docsetRoot, docsetName+".docset")
	contentsPath := filepath.Join(docsetPath, "Contents")
	resourcesDir := filepath.Join(contentsPath, "Resources")
	documentsDir := filepath.Join(resourcesDir, "Documents")
	if err := os.MkdirAll(documentsDir, 0755); err != nil {
		return err
	}

	if err := copyPath(javadocPath, documentsDir); err != nil {
		return err
	}

	plistPath := filepath.Join(contentsPath, "Info.plist")
	if err := writePlist(plistPath, docsetName); err != nil {
		return err
	}

	indexFile, err := os.Open(filepath.Join(javadocPath, "index-all.html"))
	if err != nil {
		return err
	}
	defer indexFile.Close()

	db, err := initDb(filepath.Join(resourcesDir, "docSet.dsidx"))
	if err != nil {
		return err
	}
	defer db.Close()

	tree, err := h5.New(indexFile)
	if err != nil {
		return err
	}

	itemSelector, err := selector.Selector("dl dt")
	if err != nil {
		return err
	}

	anchorSelector, err := selector.Selector("a")
	if err != nil {
		return err
	}

	// One transaction and one prepared statement for the whole index, instead
	// of a new pair per entry with Close deferred until function return.
	tx, err := db.Begin()
	if err != nil {
		return err
	}
	// Undo partial work on every early return; after a successful Commit this
	// only yields an ignorable "transaction done" error.
	defer tx.Rollback()

	statement, err := tx.Prepare("insert into searchIndex(name, type, path) VALUES(?, ?, ?)")
	if err != nil {
		return err
	}
	defer statement.Close()

	for _, node := range itemSelector.Find(tree.Top()) {
		text := nodeText(node, false)
		itemType := ""

		// Order matters: the first matching phrase wins.
		switch {
		case strings.Contains(text, "Class in"):
			itemType = "Class"
		case strings.Contains(text, "Static method in"):
			itemType = "Method"
		case strings.Contains(text, "Static variable in"):
			itemType = "Field"
		case strings.Contains(text, "Constructor"):
			itemType = "Constructor"
		case strings.Contains(text, "Method in"):
			itemType = "Method"
		case strings.Contains(text, "Variable in"):
			itemType = "Field"
		case strings.Contains(text, "Interface in"):
			itemType = "Interface"
		case strings.Contains(text, "Exception in"):
			itemType = "Exception"
		case strings.Contains(text, "Error in"):
			itemType = "Error"
		case strings.Contains(text, "Enum in"):
			itemType = "Enum"
		case strings.Contains(text, "package"):
			itemType = "Package"
		case strings.Contains(text, "Annotation Type"):
			itemType = "Notation"
		}
		if itemType == "" {
			continue
		}

		// A <dt> without a link has nothing to index; skip it instead of
		// indexing into an empty result.
		anchors := anchorSelector.Find(node)
		if len(anchors) == 0 {
			continue
		}
		anchor := anchors[0]

		itemName := nodeText(anchor, true)
		if _, err := statement.Exec(itemName, itemType, nodeAttr(anchor, "href")); err != nil {
			return err
		}
	}

	return tx.Commit()
}
示例#15
0
// Trans creates a Transform that you can apply using ApplyAll.
// It takes a TransformFunc and a CSS3 selector.
// It returns the *Transform, or nil and an error if the selector is not
// valid.
func Trans(f TransformFunc, sel string) (*Transform, error) {
	sq, err := selector.Selector(sel)
	if err != nil {
		// Do not hand back a Transform built around an unusable selector.
		return nil, err
	}
	return TransCollector(f, sq), nil
}
示例#16
0
// Subtransform constructs a TransformFunc that runs f on the nodes matched by
// the CSS3 selector sel within the tree rooted at the node the returned
// TransformFunc is run against.
// This is useful for creating self-contained Transforms that are meant to
// work on subtrees of the HTML document.
// It returns nil and an error if the selector is not valid.
func Subtransform(f TransformFunc, sel string) (TransformFunc, error) {
	sq, err := selector.Selector(sel)
	if err != nil {
		// Do not hand back a TransformFunc built around an unusable selector.
		return nil, err
	}
	return SubtransformCollector(f, sq), nil
}
示例#17
0
// Apply applies a TransformFunc to the nodes matched by the CSS3 selector
// sel.
// If the selector is not valid, its error is returned and f is not applied,
// which matches the behavior of ApplyToFirstMatch.
func (t *Transformer) Apply(f TransformFunc, sel string) error {
	sq, err := selector.Selector(sel)
	if err != nil {
		return err
	}
	t.ApplyWithCollector(f, sq)
	return nil
}
示例#18
0
// Import extracts the shows listed in the "#concert_schedule" table under
// root.
//
// The first matched row is treated as the header: the positions of the
// columns labelled "日程" (date), "会場" (venue), "開場" (doors open) and
// "開演" (start) are read from it, each defaulting to 0 when the label is
// absent. Every following row becomes one event.Show; rows with too few
// columns to hold all four fields are skipped.
//
// Each show gets a YAKeyword of the form "<month>/<day> <group>". When
// several shows share the same date string, a time-of-day marker is inserted
// before the group name: "朝" for a start before 12:00, "夜" for a start
// after 17:59, otherwise "昼".
//
// It returns (nil, nil) when the table has no rows. The error result is
// always nil.
func (importer *HPEventImporter) Import(root *html.Node) ([]*event.Show, error) {
	groupName := getGroupKeyword(root)
	// Constant selector: the compile error is deliberately ignored.
	rowSel, _ := selector.Selector("#concert_schedule tr")
	rows := rowSel.Find(root)
	if len(rows) == 0 {
		return nil, nil
	}

	// Locate the columns of interest from the header row.
	var idxDate, idxVenue, idxOpen, idxStart int
	for j, col := range h5.Children(rows[0]) {
		switch extractNodeString(col) {
		case "日程":
			idxDate = j
		case "会場":
			idxVenue = j
		case "開場":
			idxOpen = j
		case "開演":
			idxStart = j
		}
	}
	// Highest column index that a show row must provide.
	lastIdx := idxDate
	for _, idx := range []int{idxVenue, idxOpen, idxStart} {
		if idx > lastIdx {
			lastIdx = idx
		}
	}

	dateBucket := make(map[string][]*event.Show)
	shows := make([]*event.Show, 0, len(rows)-1)
	for _, row := range rows[1:] {
		columns := h5.Children(row)
		if len(columns) <= lastIdx {
			// Short rows cannot be indexed safely; skip them instead of
			// panicking.
			continue
		}
		show := &event.Show{}
		show.VenueName = parseVenue(extractNodeString(columns[idxVenue]))
		datestr := extractNodeString(columns[idxDate])
		show.OpenAt = parseDate(datestr, extractNodeString(columns[idxOpen]))
		show.StartAt = parseDate(datestr, extractNodeString(columns[idxStart]))
		show.CreatedAt = time.Now()
		show.UpdatedAt = show.CreatedAt
		shows = append(shows, show)
		dateBucket[datestr] = append(dateBucket[datestr], show)
	}

	// Set keywords; shows sharing a date need a time-of-day marker to stay
	// distinguishable.
	for _, bucket := range dateBucket {
		if len(bucket) == 1 {
			startAt := bucket[0].StartAt
			bucket[0].YAKeyword = fmt.Sprintf(
				"%d/%d %s",
				startAt.Month(),
				startAt.Day(),
				groupName,
			)
			continue
		}
		for _, show := range bucket {
			startAt := show.StartAt
			var timeKey string
			switch hour := startAt.Hour(); {
			case hour < 12:
				timeKey = "朝"
			case hour > 17:
				timeKey = "夜"
			default:
				timeKey = "昼"
			}
			show.YAKeyword = fmt.Sprintf(
				"%d/%d %s %s",
				startAt.Month(),
				startAt.Day(),
				timeKey,
				groupName,
			)
		}
	}
	return shows, nil
}