func (bc *broadcast) parseBroadcastFromHtmlNode(root *html.Node) (ret []*r.Broadcast, err error) { { // Author meta, _ := scrape.Find(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "Author" == scrape.Attr(n, "name") }) if nil != meta { content := scrape.Attr(meta, "content") bc.Author = &content } } for idx, epg := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "epg-content-right" == scrape.Attr(n, "class") }) { if idx != 0 { err = errors.New("There was more than 1 <div class='epg-content-right'/>") return } { // TitleEpisode txt, _ := scrape.Find(epg, func(n *html.Node) bool { return html.TextNode == n.Type && atom.H3 == n.Parent.DataAtom && atom.Br == n.NextSibling.DataAtom }) if nil != txt { t := strings.TrimSpace(r.NormaliseWhiteSpace(txt.Data)) bc.TitleEpisode = &t txt.Parent.RemoveChild(txt.NextSibling) txt.Parent.RemoveChild(txt) } } { // Subject a, _ := scrape.Find(epg, func(n *html.Node) bool { return atom.Div == n.Parent.DataAtom && "sendungsLink" == scrape.Attr(n.Parent, "class") && atom.A == n.DataAtom }) if nil != a { u, _ := url.Parse(scrape.Attr(a, "href")) bc.Subject = bc.Source.ResolveReference(u) } } // purge some cruft for _, nn := range scrape.FindAll(epg, func(n *html.Node) bool { clz := scrape.Attr(n, "class") return atom.H2 == n.DataAtom || "mod modSharing" == clz || "modGalery" == clz || "sendungsLink" == clz || "tabs-container" == clz }) { nn.Parent.RemoveChild(nn) } { description := r.TextWithBrFromNodeSet(scrape.FindAll(epg, func(n *html.Node) bool { return epg == n.Parent })) bc.Description = &description } } bc_ := r.Broadcast(*bc) ret = append(ret, &bc_) return }
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) { nodes := scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "time" == scrape.Attr(n, "class") }) ret = make([]*r.Broadcast, len(nodes)) for index, tim := range nodes { // prepare response bc := r.Broadcast{ BroadcastURL: r.BroadcastURL{ TimeURL: r.TimeURL(*day), }, } // some defaults bc.Language = &lang_de bc.Publisher = &publisher // set start time { div_t := strings.TrimSpace(scrape.Text(tim)) if 5 != len(div_t) { continue } hour := r.MustParseInt(div_t[0:2]) minute := r.MustParseInt(div_t[3:5]) bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone) if index > 0 { ret[index-1].DtEnd = &bc.Time } } for _, tit := range scrape.FindAll(tim.Parent, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Div == n.Parent.DataAtom && "descr" == scrape.Attr(n.Parent, "class") }) { // Title bc.Title = strings.TrimSpace(scrape.Text(tit)) href := scrape.Attr(tit, "href") if "" != href { u, _ := url.Parse(href) bc.Subject = day.Source.ResolveReference(u) } desc_node := tit.Parent desc_node.RemoveChild(tit) description := r.TextWithBrFromNodeSet([]*html.Node{desc_node}) bc.Description = &description // fmt.Fprintf(os.Stderr, "\n") } ret[index] = &bc } // fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String()) if len(nodes) > 0 { midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone) ret[len(nodes)-1].DtEnd = &midnight } return }
// Completely re-scrape everything and verify consistence at least of Time, evtl. Title func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bcs []r.Broadcast, err error) { var bc r.Broadcast bc.Station = bcu.Station bc.Source = bcu.Source { s := "de" bc.Language = &s } // Title, TitleSeries, TitleEpisode for i, h1 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H1 == n.DataAtom && "bcast_headline" == scrape.Attr(n, "class") }) { if i != 0 { err = errors.New("There was more than 1 <h1 class='bcast_headline'>") return } bc.Title = r.TextChildrenNoClimb(h1) for _, span := range scrape.FindAll(h1, func(n *html.Node) bool { return atom.Span == n.DataAtom }) { switch scrape.Attr(span, "class") { case "bcast_overline": s := scrape.Text(span) bc.TitleSeries = &s case "bcast_subtitle": s := scrape.Text(span) bc.TitleEpisode = &s default: err = errors.New("unexpected <span> inside <h1>") return } bc.Title = r.TextChildrenNoClimb(h1) } { description := r.TextWithBrFromNodeSet(scrape.FindAll(h1.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom && "copytext" == scrape.Attr(n, "class") })) bc.Description = &description } if nil == bc.Image { FoundImage0: for _, di := range scrape.FindAll(h1.Parent, func(n *html.Node) bool { return atom.Div == n.DataAtom && "teaser media_video embeddedMedia" == scrape.Attr(n, "class") }) { for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) { u, _ := url.Parse(scrape.Attr(img, "src")) bc.Image = bcu.Source.ResolveReference(u) break FoundImage0 } } } if nil == bc.Image { FoundImage1: // test some candidates: for _, no := range []*html.Node{h1.Parent, root} { for _, di := range scrape.FindAll(no, func(n *html.Node) bool { return atom.Div == n.DataAtom && "picturebox" == scrape.Attr(n, "class") }) { for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) { u, _ := url.Parse(scrape.Attr(img, "src")) bc.Image = bcu.Source.ResolveReference(u) break FoundImage1 } } } } } // Time, DtEnd for idx, p := range scrape.FindAll(root, func(n *html.Node) bool { return atom.P == n.DataAtom && "bcast_date" == scrape.Attr(n, "class") }) { if idx != 0 { err = errors.New("There was more than 1 <p class='bcast_date'>") return } m := bcDateRegExp.FindStringSubmatch(scrape.Text(p)) if nil == m { err = errors.New("There was no date match") return } i := r.MustParseInt bc.Time = time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[4]), i(m[5]), 0, 0, localLoc) t := time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[6]), i(m[7]), 0, 0, localLoc) if bc.Time.Hour() > t.Hour() || (bc.Time.Hour() == t.Hour() && bc.Time.Minute() > t.Minute()) { // after midnight t = t.AddDate(0, 0, 1) } bc.DtEnd = &t } // Language for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "og:locale" == scrape.Attr(n, "property") }) { if idx != 0 { err = errors.New("There was more than 1 <meta property='og:locale'/>") return } v := scrape.Attr(meta, "content")[0:2] bc.Language = &v } // Subject for idx, a := range scrape.FindAll(root, func(n *html.Node) bool { return atom.A == n.DataAtom && strings.HasPrefix(scrape.Attr(n, "class"), "link_broadcast media_broadcastSeries") }) { if idx != 0 { err = errors.New("There was more than 1 <a class='link_broadcast media_broadcastSeries'/>") return } u, _ := url.Parse(scrape.Attr(a, "href")) bc.Subject = bc.Source.ResolveReference(u) } // Modified for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property") }) { if idx != 0 { err = errors.New("There was more than 1 <meta property='og:article:modified_time'/>") return } v, _ := time.Parse(time.RFC3339, scrape.Attr(meta, "content")) bc.Modified = &v } // Author for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name") }) { if idx != 0 { err = errors.New("There was more than 1 <meta name='author'/>") return } s := scrape.Attr(meta, "content") bc.Author = &s } if "" == bc.Station.Identifier { panic("How can the identifier miss?") } bcs = append(bcs, bc) return }
// Completely re-scrape everything and verify consistence at least of Time, evtl. Title func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bc r.Broadcast, err error) { bc.Station = bcu.Station if "" == bc.Station.Identifier { panic("How can the identifier miss?") } bc.Source = bcu.Source bc.Time = bcu.Time bc.Image = bcu.Image { s := "de" bc.Language = &s } for i, main := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "br-main-text" == scrape.Attr(n, "class") }) { if 1 < i { err = errors.New("unexpected 2nd <div class='br-main-text'> ") return } // Subject for idx, h3 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H3 == n.DataAtom && "Weitere Informationen" == scrape.Text(n) }) { // fmt.Fprintf(os.Stderr, "GET %s\n", "uhu") if idx != 0 { err = errors.New("There was more than 1 <h3>Weitere Informationen") return } for _, a := range scrape.FindAll(h3.Parent, func(n *html.Node) bool { return atom.A == n.DataAtom }) { u, _ := url.Parse(scrape.Attr(a, "href")) bc.Subject = bc.Source.ResolveReference(u) } h3.Parent.Parent.RemoveChild(h3.Parent) } for i1, h2 := range scrape.FindAll(main, func(n *html.Node) bool { return atom.H2 == n.DataAtom }) { if 1 < i1 { err = errors.New("unexpected 2nd <h2> ") return } for i4, em := range scrape.FindAll(h2, func(n *html.Node) bool { return atom.Em == n.DataAtom }) { if 1 < i4 { err = errors.New("unexpected 2nd <em> ") return } bc.Title = scrape.Text(em) em.Parent.RemoveChild(em) } s := scrape.Text(h2) bc.TitleSeries = &s for i2, h3 := range scrape.FindAll(main, func(n *html.Node) bool { return atom.H3 == n.DataAtom }) { if 1 < i2 { err = errors.New("unexpected 2nd <h3> ") return } s := scrape.Text(h3) bc.TitleEpisode = &s h3.Parent.RemoveChild(h3) } inner := h2.Parent.Parent.Parent h2.Parent.RemoveChild(h2) for ch := inner.FirstChild; ch != nil; ch = ch.NextSibling { if atom.Div == ch.DataAtom { inner.RemoveChild(ch) // once removed NextSibling returns nil } } // Description description := r.TextWithBrFromNodeSet(scrape.FindAll(inner, func(n *html.Node) bool { return atom.P == n.DataAtom || atom.Div == n.DataAtom })) bc.Description = &description } } // DtEnd for _, p := range scrape.FindAll(root, func(n *html.Node) bool { return atom.P == n.DataAtom && "br-time" == scrape.Attr(n, "class") }) { m := bcDateRegExp.FindStringSubmatch(scrape.Text(p)) if nil == m { err = errors.New("There was no date match") return } i := r.MustParseInt // bc.Time = time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[4]), i(m[5]), 0, 0, localLoc) t := time.Date(bc.Time.Year(), bc.Time.Month(), bc.Time.Day(), i(m[3]), i(m[4]), 0, 0, localLoc) if bc.Time.Hour() > t.Hour() || (bc.Time.Hour() == t.Hour() && bc.Time.Minute() > t.Minute()) { // after midnight t = t.AddDate(0, 0, 1) } bc.DtEnd = &t } // Modified for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property") }) { if idx != 0 { err = errors.New("There was more than 1 <meta property='og:article:modified_time'/>") return } v, _ := time.Parse(time.RFC3339, scrape.Attr(meta, "content")) bc.Modified = &v } // Author for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name") }) { if idx != 0 { err = errors.New("There was more than 1 <meta name='author'/>") return } s := scrape.Attr(meta, "content") bc.Author = &s } return }
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) { // fmt.Fprintf(os.Stderr, "%s\n", day.Source.String()) index := 0 for _, at := range scrape.FindAll(root, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Td == n.Parent.DataAtom && atom.Tr == n.Parent.Parent.DataAtom && "time" == scrape.Attr(n.Parent, "class") }) { // prepare response bc := r.Broadcast{ BroadcastURL: r.BroadcastURL{ TimeURL: r.TimeURL(*day), }, } // some defaults bc.Language = &lang_de { publisher := "http://www.deutschlandfunk.de/" if "drk" == day.Station.Identifier { publisher = "http://www.deutschlandradiokultur.de/" } bc.Publisher = &publisher } // set start time { a_id := scrape.Attr(at, "name") if "" == a_id { continue } bc.Source.Fragment = a_id hour := r.MustParseInt(a_id[0:2]) minute := r.MustParseInt(a_id[2:4]) if 24 < hour || 60 < minute { continue } bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone) if index > 0 { ret[index-1].DtEnd = &bc.Time } } // Title for idx, h3 := range scrape.FindAll(at.Parent.Parent, func(n *html.Node) bool { return atom.H3 == n.DataAtom && atom.Td == n.Parent.DataAtom && atom.Tr == n.Parent.Parent.DataAtom && "description" == scrape.Attr(n.Parent, "class") }) { if idx != 0 { err = errors.New("There was more than 1 <tr><td class='description'><h3>") return } // purge 'aufnehmen' link: for _, chi := range scrape.FindAll(h3, func(n *html.Node) bool { return atom.A == n.DataAtom && "psradio" == scrape.Attr(n, "class") }) { h3.RemoveChild(chi) } // fmt.Fprintf(os.Stderr, " '%s'\n", scrape.Text(h3)) for idx, h3_a := range scrape.FindAll(h3, func(n *html.Node) bool { return atom.A == n.DataAtom }) { if idx != 0 { err = errors.New("There was more than 1 <tr><td class='description'><h3><a>") return } bc.Title = scrape.Text(h3_a) u, _ := url.Parse(scrape.Attr(h3_a, "href")) bc.Subject = day.Source.ResolveReference(u) } bc.Title = strings.TrimSpace(bc.Title) if "" == bc.Title { bc.Title = r.TextChildrenNoClimb(h3) } // fmt.Fprintf(os.Stderr, " '%s'", bc.Title) { description := r.TextWithBrFromNodeSet(scrape.FindAll(h3.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom })) bc.Description = &description } } // fmt.Fprintf(os.Stderr, "\n") ret = append(ret, &bc) index += 1 } // fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String()) if index > 0 { midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone) ret[index-1].DtEnd = &midnight } return }