示例#1
0
// Completely re-scrape everything and verify consistence at least of Time, evtl. Title
func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bcs []r.Broadcast, err error) {
	var bc r.Broadcast
	bc.Station = bcu.Station
	bc.Source = bcu.Source
	{
		s := "de"
		bc.Language = &s
	}
	// Title, TitleSeries, TitleEpisode
	for i, h1 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H1 == n.DataAtom && "bcast_headline" == scrape.Attr(n, "class") }) {
		if i != 0 {
			err = errors.New("There was more than 1 <h1 class='bcast_headline'>")
			return
		}
		bc.Title = r.TextChildrenNoClimb(h1)
		for _, span := range scrape.FindAll(h1, func(n *html.Node) bool { return atom.Span == n.DataAtom }) {
			switch scrape.Attr(span, "class") {
			case "bcast_overline":
				s := scrape.Text(span)
				bc.TitleSeries = &s
			case "bcast_subtitle":
				s := scrape.Text(span)
				bc.TitleEpisode = &s
			default:
				err = errors.New("unexpected <span> inside <h1>")
				return
			}
			bc.Title = r.TextChildrenNoClimb(h1)
		}
		{
			description := r.TextWithBrFromNodeSet(scrape.FindAll(h1.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom && "copytext" == scrape.Attr(n, "class") }))
			bc.Description = &description
		}
		if nil == bc.Image {
		FoundImage0:
			for _, di := range scrape.FindAll(h1.Parent, func(n *html.Node) bool {
				return atom.Div == n.DataAtom && "teaser media_video embeddedMedia" == scrape.Attr(n, "class")
			}) {
				for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
					u, _ := url.Parse(scrape.Attr(img, "src"))
					bc.Image = bcu.Source.ResolveReference(u)
					break FoundImage0
				}
			}
		}
		if nil == bc.Image {
		FoundImage1:
			// test some candidates:
			for _, no := range []*html.Node{h1.Parent, root} {
				for _, di := range scrape.FindAll(no, func(n *html.Node) bool { return atom.Div == n.DataAtom && "picturebox" == scrape.Attr(n, "class") }) {
					for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
						u, _ := url.Parse(scrape.Attr(img, "src"))
						bc.Image = bcu.Source.ResolveReference(u)
						break FoundImage1
					}
				}
			}
		}
	}

	// Time, DtEnd
	for idx, p := range scrape.FindAll(root, func(n *html.Node) bool { return atom.P == n.DataAtom && "bcast_date" == scrape.Attr(n, "class") }) {
		if idx != 0 {
			err = errors.New("There was more than 1 <p class='bcast_date'>")
			return
		}
		m := bcDateRegExp.FindStringSubmatch(scrape.Text(p))
		if nil == m {
			err = errors.New("There was no date match")
			return
		}
		i := r.MustParseInt
		bc.Time = time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[4]), i(m[5]), 0, 0, localLoc)
		t := time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[6]), i(m[7]), 0, 0, localLoc)
		if bc.Time.Hour() > t.Hour() || (bc.Time.Hour() == t.Hour() && bc.Time.Minute() > t.Minute()) { // after midnight
			t = t.AddDate(0, 0, 1)
		}
		bc.DtEnd = &t
	}

	// Language
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:locale" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta property='og:locale'/>")
			return
		}
		v := scrape.Attr(meta, "content")[0:2]
		bc.Language = &v
	}

	// Subject
	for idx, a := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.A == n.DataAtom && strings.HasPrefix(scrape.Attr(n, "class"), "link_broadcast media_broadcastSeries")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <a class='link_broadcast media_broadcastSeries'/>")
			return
		}
		u, _ := url.Parse(scrape.Attr(a, "href"))
		bc.Subject = bc.Source.ResolveReference(u)
	}

	// Modified
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta property='og:article:modified_time'/>")
			return
		}
		v, _ := time.Parse(time.RFC3339, scrape.Attr(meta, "content"))
		bc.Modified = &v
	}

	// Author
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta name='author'/>")
			return
		}
		s := scrape.Attr(meta, "content")
		bc.Author = &s
	}

	if "" == bc.Station.Identifier {
		panic("How can the identifier miss?")
	}
	bcs = append(bcs, bc)
	return
}