Пример #1
0
// parseBroadcastFromHtmlNode scrapes a broadcast detail page and stores what
// it finds in the receiver:
//
//   - Author from <meta name="Author" content="...">,
//   - TitleEpisode from the text node inside an <h3> that is directly followed
//     by a <br>,
//   - Subject from the <a href> inside <div class="sendungsLink">, resolved
//     against bc.Source,
//   - Description from the remaining children of <div class="epg-content-right">.
//
// The DOM below <div class="epg-content-right"> is modified while scraping:
// the title text node, its <br> and several decorative elements are removed so
// that they do not end up in the description.
//
// It returns a one-element slice holding a copy of the receiver converted to
// r.Broadcast. An error is returned when the page contains more than one
// <div class="epg-content-right">.
func (bc *broadcast) parseBroadcastFromHtmlNode(root *html.Node) (ret []*r.Broadcast, err error) {
	// Author
	if meta, _ := scrape.Find(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "Author" == scrape.Attr(n, "name")
	}); nil != meta {
		content := scrape.Attr(meta, "content")
		bc.Author = &content
	}

	containers := scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Div == n.DataAtom && "epg-content-right" == scrape.Attr(n, "class")
	})
	for idx, epg := range containers {
		if idx != 0 {
			return nil, errors.New("There was more than 1 <div class='epg-content-right'/>")
		}

		// TitleEpisode
		// A text node that is the last child of its <h3> has no NextSibling;
		// guard against that instead of dereferencing nil.
		txt, _ := scrape.Find(epg, func(n *html.Node) bool {
			return html.TextNode == n.Type &&
				nil != n.Parent && atom.H3 == n.Parent.DataAtom &&
				nil != n.NextSibling && atom.Br == n.NextSibling.DataAtom
		})
		if nil != txt {
			t := strings.TrimSpace(r.NormaliseWhiteSpace(txt.Data))
			bc.TitleEpisode = &t
			// Drop the <br> first: removing txt would clear its NextSibling.
			txt.Parent.RemoveChild(txt.NextSibling)
			txt.Parent.RemoveChild(txt)
		}

		// Subject
		a, _ := scrape.Find(epg, func(n *html.Node) bool {
			return nil != n.Parent && atom.Div == n.Parent.DataAtom &&
				"sendungsLink" == scrape.Attr(n.Parent, "class") &&
				atom.A == n.DataAtom
		})
		if nil != a {
			// url.Parse yields a nil URL on failure, which ResolveReference
			// cannot handle; an unparsable href simply leaves Subject unset.
			if u, e := url.Parse(scrape.Attr(a, "href")); nil == e {
				bc.Subject = bc.Source.ResolveReference(u)
			}
		}

		// purge some cruft
		for _, nn := range scrape.FindAll(epg, func(n *html.Node) bool {
			clz := scrape.Attr(n, "class")
			return atom.H2 == n.DataAtom ||
				"mod modSharing" == clz ||
				"modGalery" == clz ||
				"sendungsLink" == clz ||
				"tabs-container" == clz
		}) {
			nn.Parent.RemoveChild(nn)
		}

		// Description: whatever direct children of the container are left.
		description := r.TextWithBrFromNodeSet(scrape.FindAll(epg, func(n *html.Node) bool { return epg == n.Parent }))
		bc.Description = &description
	}

	cp := r.Broadcast(*bc)
	return append(ret, &cp), nil
}
Пример #2
0
// parseBroadcastsFromNode scrapes the day schedule below root and returns one
// broadcast per <div class="time"> whose trimmed text is exactly five bytes
// long (hour in bytes 0-1, minute in bytes 3-4; presumably "HH:MM" — the
// separator itself is not checked).
//
// Title, Subject and Description are taken from the <a> inside a
// <div class="descr"> found below the time div's parent; that <a> is removed
// from the DOM before the description text is collected.
//
// Each broadcast's DtEnd is the start time of the next returned broadcast; the
// last one ends at 24:00 of the day, which time.Date normalises to 00:00 of
// the following day. Time divs that do not carry a five-byte time are skipped
// and do not appear in the result. The returned error is always nil.
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
	nodes := scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "time" == scrape.Attr(n, "class") })
	// Appending (rather than indexing a pre-sized slice) keeps skipped time
	// divs from leaving nil holes that would be dereferenced further down.
	ret = make([]*r.Broadcast, 0, len(nodes))
	for _, tim := range nodes {
		// prepare response
		bc := r.Broadcast{
			BroadcastURL: r.BroadcastURL{
				TimeURL: r.TimeURL(*day),
			},
		}
		// some defaults
		bc.Language = &lang_de
		bc.Publisher = &publisher

		// set start time
		hhmm := strings.TrimSpace(scrape.Text(tim))
		if 5 != len(hhmm) {
			continue
		}
		hour := r.MustParseInt(hhmm[0:2])
		minute := r.MustParseInt(hhmm[3:5])
		bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
		if n := len(ret); n > 0 {
			// the previous broadcast ends when this one starts
			ret[n-1].DtEnd = &bc.Time
		}

		for _, tit := range scrape.FindAll(tim.Parent, func(n *html.Node) bool {
			return atom.A == n.DataAtom &&
				nil != n.Parent && atom.Div == n.Parent.DataAtom &&
				"descr" == scrape.Attr(n.Parent, "class")
		}) {
			// Title
			bc.Title = strings.TrimSpace(scrape.Text(tit))
			if href := scrape.Attr(tit, "href"); "" != href {
				// url.Parse yields a nil URL on failure, which ResolveReference
				// cannot handle; an unparsable href leaves Subject unset.
				if u, e := url.Parse(href); nil == e {
					bc.Subject = day.Source.ResolveReference(u)
				}
			}

			// Description: the surrounding div without the title link.
			descNode := tit.Parent
			descNode.RemoveChild(tit)
			description := r.TextWithBrFromNodeSet([]*html.Node{descNode})
			bc.Description = &description
		}
		ret = append(ret, &bc)
	}
	if n := len(ret); n > 0 {
		midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
		ret[n-1].DtEnd = &midnight
	}
	return ret, nil
}
Пример #3
0
// parseBroadcastNode scrapes a single broadcast detail page from scratch.
//
// Station and Source are copied from the receiver; everything else is read
// from the document below root:
//
//   - Title, TitleSeries, TitleEpisode from <h1 class="bcast_headline"> and its
//     "bcast_overline" / "bcast_subtitle" spans,
//   - Description from <p class="copytext"> below the headline's parent,
//   - Image from the first parsable <img src> inside a
//     "teaser media_video embeddedMedia" div, else inside a "picturebox" div,
//   - Time and DtEnd from <p class="bcast_date"> via bcDateRegExp,
//   - Language from the first two bytes of <meta property="og:locale">
//     (default "de"),
//   - Subject from <a class="link_broadcast media_broadcastSeries...">,
//   - Modified from <meta property="og:article:modified_time"> (RFC 3339),
//   - Author from <meta name="author">.
//
// Attribute values that cannot be parsed (URLs, the modification time, a
// locale shorter than two bytes) leave the corresponding field untouched.
//
// It returns a one-element slice, or an error when one of the elements that
// must be unique occurs more than once, when the headline contains an
// unexpected <span>, or when the date text does not match. It panics when the
// station identifier is empty.
func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bcs []r.Broadcast, err error) {
	var bc r.Broadcast
	bc.Station = bcu.Station
	bc.Source = bcu.Source
	{
		s := "de"
		bc.Language = &s
	}

	isImg := func(n *html.Node) bool { return atom.Img == n.DataAtom }

	// Title, TitleSeries, TitleEpisode
	for i, h1 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H1 == n.DataAtom && "bcast_headline" == scrape.Attr(n, "class") }) {
		if i != 0 {
			return nil, errors.New("There was more than 1 <h1 class='bcast_headline'>")
		}
		bc.Title = r.TextChildrenNoClimb(h1)
		for _, span := range scrape.FindAll(h1, func(n *html.Node) bool { return atom.Span == n.DataAtom }) {
			switch scrape.Attr(span, "class") {
			case "bcast_overline":
				s := scrape.Text(span)
				bc.TitleSeries = &s
			case "bcast_subtitle":
				s := scrape.Text(span)
				bc.TitleEpisode = &s
			default:
				return nil, errors.New("unexpected <span> inside <h1>")
			}
			// NOTE(review): looks redundant with the assignment before the
			// loop, as h1 is not modified in between — confirm and drop.
			bc.Title = r.TextChildrenNoClimb(h1)
		}
		{
			description := r.TextWithBrFromNodeSet(scrape.FindAll(h1.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom && "copytext" == scrape.Attr(n, "class") }))
			bc.Description = &description
		}
		if nil == bc.Image {
		FoundImage0:
			for _, di := range scrape.FindAll(h1.Parent, func(n *html.Node) bool {
				return atom.Div == n.DataAtom && "teaser media_video embeddedMedia" == scrape.Attr(n, "class")
			}) {
				for _, img := range scrape.FindAll(di, isImg) {
					u, e := url.Parse(scrape.Attr(img, "src"))
					if nil != e {
						// a nil URL would crash ResolveReference; try the next <img>
						continue
					}
					bc.Image = bcu.Source.ResolveReference(u)
					break FoundImage0
				}
			}
		}
		if nil == bc.Image {
		FoundImage1:
			// test some candidates:
			for _, no := range []*html.Node{h1.Parent, root} {
				for _, di := range scrape.FindAll(no, func(n *html.Node) bool { return atom.Div == n.DataAtom && "picturebox" == scrape.Attr(n, "class") }) {
					for _, img := range scrape.FindAll(di, isImg) {
						u, e := url.Parse(scrape.Attr(img, "src"))
						if nil != e {
							continue
						}
						bc.Image = bcu.Source.ResolveReference(u)
						break FoundImage1
					}
				}
			}
		}
	}

	// Time, DtEnd
	for idx, p := range scrape.FindAll(root, func(n *html.Node) bool { return atom.P == n.DataAtom && "bcast_date" == scrape.Attr(n, "class") }) {
		if idx != 0 {
			return nil, errors.New("There was more than 1 <p class='bcast_date'>")
		}
		m := bcDateRegExp.FindStringSubmatch(scrape.Text(p))
		if nil == m {
			return nil, errors.New("There was no date match")
		}
		atoi := r.MustParseInt
		year, month, dom := atoi(m[3]), time.Month(atoi(m[2])), atoi(m[1])
		bc.Time = time.Date(year, month, dom, atoi(m[4]), atoi(m[5]), 0, 0, localLoc)
		end := time.Date(year, month, dom, atoi(m[6]), atoi(m[7]), 0, 0, localLoc)
		// An end time of day earlier than the start means the broadcast runs
		// past midnight, so the end belongs to the following day.
		if bc.Time.Hour() > end.Hour() || (bc.Time.Hour() == end.Hour() && bc.Time.Minute() > end.Minute()) {
			end = end.AddDate(0, 0, 1)
		}
		bc.DtEnd = &end
	}

	// Language
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:locale" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			return nil, errors.New("There was more than 1 <meta property='og:locale'/>")
		}
		// Slicing a missing or one-byte content attribute would panic; keep
		// the default language in that case.
		if locale := scrape.Attr(meta, "content"); 2 <= len(locale) {
			v := locale[0:2]
			bc.Language = &v
		}
	}

	// Subject
	for idx, a := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.A == n.DataAtom && strings.HasPrefix(scrape.Attr(n, "class"), "link_broadcast media_broadcastSeries")
	}) {
		if idx != 0 {
			return nil, errors.New("There was more than 1 <a class='link_broadcast media_broadcastSeries'/>")
		}
		if u, e := url.Parse(scrape.Attr(a, "href")); nil == e {
			bc.Subject = bc.Source.ResolveReference(u)
		}
	}

	// Modified
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			return nil, errors.New("There was more than 1 <meta property='og:article:modified_time'/>")
		}
		// Only record a timestamp that actually parsed; otherwise Modified
		// would point at the zero time.
		if v, e := time.Parse(time.RFC3339, scrape.Attr(meta, "content")); nil == e {
			bc.Modified = &v
		}
	}

	// Author
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name")
	}) {
		if idx != 0 {
			return nil, errors.New("There was more than 1 <meta name='author'/>")
		}
		s := scrape.Attr(meta, "content")
		bc.Author = &s
	}

	if "" == bc.Station.Identifier {
		panic("How can the identifier miss?")
	}
	return append(bcs, bc), nil
}
Пример #4
0
// parseBroadcastNode scrapes a single broadcast detail page.
//
// Station, Source, Time and Image are copied from the receiver and Language
// defaults to "de". From the document below root it reads:
//
//   - Subject from the links next to the <h3> reading "Weitere Informationen"
//     (the last parsable href wins); the heading's parent is then removed,
//   - Title from the <em> inside the <h2> of <div class="br-main-text">,
//     TitleSeries from the rest of that <h2>, TitleEpisode from the <h3>,
//   - Description from the <p> and <div> elements left in the <h2>'s
//     great-grandparent once the headings and its first <div> child are removed,
//   - DtEnd from <p class="br-time"> via bcDateRegExp, on the date of bc.Time,
//   - Modified from <meta property="og:article:modified_time"> (RFC 3339),
//   - Author from <meta name="author">.
//
// The DOM is modified while scraping. Attribute values that cannot be parsed
// (URLs, the modification time) leave the corresponding field untouched.
//
// On error the partially filled broadcast is returned alongside the error.
// The function panics when the station identifier is empty.
func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bc r.Broadcast, err error) {
	bc.Station = bcu.Station
	if "" == bc.Station.Identifier {
		panic("How can the identifier miss?")
	}
	bc.Source = bcu.Source
	bc.Time = bcu.Time
	bc.Image = bcu.Image
	{
		s := "de"
		bc.Language = &s
	}

	// NOTE(review): the "1 < i" style checks below only fire for a third
	// element although the messages speak of a 2nd one — confirm which of the
	// two is intended before tightening them.
	for i, main := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "br-main-text" == scrape.Attr(n, "class") }) {
		if 1 < i {
			return bc, errors.New("unexpected 2nd <div class='br-main-text'> ")
		}

		// Subject
		for idx, h3 := range scrape.FindAll(root, func(n *html.Node) bool {
			return atom.H3 == n.DataAtom && "Weitere Informationen" == scrape.Text(n)
		}) {
			if idx != 0 {
				return bc, errors.New("There was more than 1 <h3>Weitere Informationen")
			}
			box := h3.Parent
			for _, a := range scrape.FindAll(box, func(n *html.Node) bool {
				return atom.A == n.DataAtom
			}) {
				// url.Parse yields a nil URL on failure, which
				// ResolveReference cannot handle; skip such links.
				if u, e := url.Parse(scrape.Attr(a, "href")); nil == e {
					bc.Subject = bc.Source.ResolveReference(u)
				}
			}
			// The box may be the topmost node, in which case there is nothing
			// to detach it from.
			if nil != box.Parent {
				box.Parent.RemoveChild(box)
			}
		}

		for i1, h2 := range scrape.FindAll(main, func(n *html.Node) bool { return atom.H2 == n.DataAtom }) {
			if 1 < i1 {
				return bc, errors.New("unexpected 2nd <h2> ")
			}
			for i4, em := range scrape.FindAll(h2, func(n *html.Node) bool { return atom.Em == n.DataAtom }) {
				if 1 < i4 {
					return bc, errors.New("unexpected 2nd <em> ")
				}
				bc.Title = scrape.Text(em)
				em.Parent.RemoveChild(em)
			}
			// with the <em> gone, the remaining headline text is the series title
			series := scrape.Text(h2)
			bc.TitleSeries = &series

			for i2, h3 := range scrape.FindAll(main, func(n *html.Node) bool { return atom.H3 == n.DataAtom }) {
				if 1 < i2 {
					return bc, errors.New("unexpected 2nd <h3> ")
				}
				episode := scrape.Text(h3)
				bc.TitleEpisode = &episode
				h3.Parent.RemoveChild(h3)
			}

			// The description container sits three levels above the <h2>;
			// report a document that is too shallow instead of crashing.
			var inner *html.Node
			if p := h2.Parent; nil != p && nil != p.Parent {
				inner = p.Parent.Parent
			}
			if nil == inner {
				return bc, errors.New("<h2> is nested too shallow to locate the description container")
			}
			h2.Parent.RemoveChild(h2)

			// Only the first <div> child is dropped. RemoveChild clears the
			// removed node's NextSibling, so a plain sibling walk ends right
			// after the first removal anyway; the break makes that explicit.
			// NOTE(review): if all <div> children were meant to go, the next
			// sibling has to be remembered before removing — confirm.
			for ch := inner.FirstChild; ch != nil; ch = ch.NextSibling {
				if atom.Div == ch.DataAtom {
					inner.RemoveChild(ch)
					break
				}
			}

			// Description
			description := r.TextWithBrFromNodeSet(scrape.FindAll(inner, func(n *html.Node) bool { return atom.P == n.DataAtom || atom.Div == n.DataAtom }))
			bc.Description = &description
		}
	}

	// DtEnd
	for _, p := range scrape.FindAll(root, func(n *html.Node) bool { return atom.P == n.DataAtom && "br-time" == scrape.Attr(n, "class") }) {
		m := bcDateRegExp.FindStringSubmatch(scrape.Text(p))
		if nil == m {
			return bc, errors.New("There was no date match")
		}
		atoi := r.MustParseInt
		// The start time is taken from the receiver; only the end hour and
		// minute (groups 3 and 4) are read from the page.
		end := time.Date(bc.Time.Year(), bc.Time.Month(), bc.Time.Day(), atoi(m[3]), atoi(m[4]), 0, 0, localLoc)
		// An end time of day earlier than the start means the broadcast runs
		// past midnight, so the end belongs to the following day.
		if bc.Time.Hour() > end.Hour() || (bc.Time.Hour() == end.Hour() && bc.Time.Minute() > end.Minute()) {
			end = end.AddDate(0, 0, 1)
		}
		bc.DtEnd = &end
	}

	// Modified
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			return bc, errors.New("There was more than 1 <meta property='og:article:modified_time'/>")
		}
		// Only record a timestamp that actually parsed; otherwise Modified
		// would point at the zero time.
		if v, e := time.Parse(time.RFC3339, scrape.Attr(meta, "content")); nil == e {
			bc.Modified = &v
		}
	}

	// Author
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name")
	}) {
		if idx != 0 {
			return bc, errors.New("There was more than 1 <meta name='author'/>")
		}
		s := scrape.Attr(meta, "content")
		bc.Author = &s
	}

	return bc, nil
}
Пример #5
0
// parseBroadcastsFromNode scrapes the day schedule below root and returns one
// broadcast per <tr><td class="time"><a name="..."> anchor.
//
// The anchor name supplies the URL fragment of the broadcast's Source and the
// start time (hour in bytes 0-1, minute in bytes 2-3). Anchors with a name
// shorter than four bytes, an hour above 24 or a minute above 60 are skipped.
// Title, Subject and Description come from the <h3> inside the
// <td class="description"> of the same row; "psradio" links are removed from
// that <h3> first.
//
// Each broadcast's DtEnd is the start time of the next returned broadcast; the
// last one ends at 24:00 of the day, which time.Date normalises to 00:00 of
// the following day.
//
// An error is returned, together with the broadcasts collected so far, when a
// row has more than one description <h3> or such an <h3> has more than one <a>.
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
	for _, at := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.A == n.DataAtom &&
			nil != n.Parent && atom.Td == n.Parent.DataAtom &&
			nil != n.Parent.Parent && atom.Tr == n.Parent.Parent.DataAtom &&
			"time" == scrape.Attr(n.Parent, "class")
	}) {
		// prepare response
		bc := r.Broadcast{
			BroadcastURL: r.BroadcastURL{
				TimeURL: r.TimeURL(*day),
			},
		}

		// some defaults
		bc.Language = &lang_de
		{
			publisher := "http://www.deutschlandfunk.de/"
			if "drk" == day.Station.Identifier {
				publisher = "http://www.deutschlandradiokultur.de/"
			}
			bc.Publisher = &publisher
		}

		// set start time
		anchor := scrape.Attr(at, "name")
		if len(anchor) < 4 {
			// missing name, or too short to slice hour and minute out of it
			continue
		}
		bc.Source.Fragment = anchor
		hour := r.MustParseInt(anchor[0:2])
		minute := r.MustParseInt(anchor[2:4])
		if 24 < hour || 60 < minute {
			continue
		}
		bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
		if n := len(ret); n > 0 {
			// the previous broadcast ends when this one starts
			ret[n-1].DtEnd = &bc.Time
		}

		// Title
		for idx, h3 := range scrape.FindAll(at.Parent.Parent, func(n *html.Node) bool {
			return atom.H3 == n.DataAtom &&
				nil != n.Parent && atom.Td == n.Parent.DataAtom &&
				nil != n.Parent.Parent && atom.Tr == n.Parent.Parent.DataAtom &&
				"description" == scrape.Attr(n.Parent, "class")
		}) {
			if idx != 0 {
				return ret, errors.New("There was more than 1 <tr><td class='description'><h3>")
			}
			// purge 'aufnehmen' link:
			for _, chi := range scrape.FindAll(h3, func(n *html.Node) bool {
				return atom.A == n.DataAtom &&
					"psradio" == scrape.Attr(n, "class")
			}) {
				// The link may be nested deeper than a direct child of the
				// <h3>; RemoveChild must be called on its actual parent.
				chi.Parent.RemoveChild(chi)
			}

			for linkIdx, link := range scrape.FindAll(h3, func(n *html.Node) bool {
				return atom.A == n.DataAtom
			}) {
				if linkIdx != 0 {
					return ret, errors.New("There was more than 1 <tr><td class='description'><h3><a>")
				}
				bc.Title = scrape.Text(link)
				// url.Parse yields a nil URL on failure, which ResolveReference
				// cannot handle; an unparsable href leaves Subject unset.
				if u, e := url.Parse(scrape.Attr(link, "href")); nil == e {
					bc.Subject = day.Source.ResolveReference(u)
				}
			}
			bc.Title = strings.TrimSpace(bc.Title)
			if "" == bc.Title {
				// no (usable) link: fall back to the heading's own text
				bc.Title = r.TextChildrenNoClimb(h3)
			}
			{
				description := r.TextWithBrFromNodeSet(scrape.FindAll(h3.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom }))
				bc.Description = &description
			}
		}
		ret = append(ret, &bc)
	}
	if n := len(ret); n > 0 {
		midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
		ret[n-1].DtEnd = &midnight
	}
	return ret, nil
}