func (bcu *calItemRangeURL) parseBroadcastsFromData(programm B3Programm) (bcs []r.Broadcast, err error) { language := "de" author := "Bayerischer Rundfunk" empty := "" for _, b3 := range programm.Broadcasts { // fill one broadcast from JSON to r.Broadcast b := r.Broadcast{ BroadcastURL: r.BroadcastURL{ TimeURL: r.TimeURL{ Station: bcu.Station, Source: bcu.Source}, }, Language: &language, Author: &author, // Creator: &bcu.Station.Name, // Copyright: &bcu.Station.Name, Description: &empty, } stripPrefix := func(s string) *string { if "" == s { return nil } if strings.HasPrefix(s, "BAYERN 3 - ") { s = s[len("BAYERN 3 - "):len(s)] } if strings.HasPrefix(s, "BAYERN 3 ") { s = s[len("BAYERN 3 "):len(s)] } return &s } b.Title = *stripPrefix(b3.Headline) b.TitleSeries = stripPrefix(b3.BroadcastSeriesName) b.TitleEpisode = stripPrefix(b3.SubTitle) { // Time (start) start, err0 := time.Parse(time.RFC3339, b3.StartTime) err = err0 if nil != err { continue } b.Time = start } { // DtEnd end, err1 := time.Parse(time.RFC3339, b3.EndTime) err = err1 if nil != err { continue } b.DtEnd = &end } bcs = append(bcs, b) } return }
// Completely re-scrape everything and verify consistence at least of Time, evtl. Title func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bcs []r.Broadcast, err error) { var bc r.Broadcast bc.Station = bcu.Station bc.Source = bcu.Source { s := "de" bc.Language = &s } // Title, TitleSeries, TitleEpisode for i, h1 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H1 == n.DataAtom && "bcast_headline" == scrape.Attr(n, "class") }) { if i != 0 { err = errors.New("There was more than 1 <h1 class='bcast_headline'>") return } bc.Title = r.TextChildrenNoClimb(h1) for _, span := range scrape.FindAll(h1, func(n *html.Node) bool { return atom.Span == n.DataAtom }) { switch scrape.Attr(span, "class") { case "bcast_overline": s := scrape.Text(span) bc.TitleSeries = &s case "bcast_subtitle": s := scrape.Text(span) bc.TitleEpisode = &s default: err = errors.New("unexpected <span> inside <h1>") return } bc.Title = r.TextChildrenNoClimb(h1) } { description := r.TextWithBrFromNodeSet(scrape.FindAll(h1.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom && "copytext" == scrape.Attr(n, "class") })) bc.Description = &description } if nil == bc.Image { FoundImage0: for _, di := range scrape.FindAll(h1.Parent, func(n *html.Node) bool { return atom.Div == n.DataAtom && "teaser media_video embeddedMedia" == scrape.Attr(n, "class") }) { for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) { u, _ := url.Parse(scrape.Attr(img, "src")) bc.Image = bcu.Source.ResolveReference(u) break FoundImage0 } } } if nil == bc.Image { FoundImage1: // test some candidates: for _, no := range []*html.Node{h1.Parent, root} { for _, di := range scrape.FindAll(no, func(n *html.Node) bool { return atom.Div == n.DataAtom && "picturebox" == scrape.Attr(n, "class") }) { for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) { u, _ := url.Parse(scrape.Attr(img, "src")) bc.Image = bcu.Source.ResolveReference(u) break FoundImage1 } } } } } // Time, DtEnd for idx, p := range scrape.FindAll(root, func(n *html.Node) bool { return atom.P == n.DataAtom && "bcast_date" == scrape.Attr(n, "class") }) { if idx != 0 { err = errors.New("There was more than 1 <p class='bcast_date'>") return } m := bcDateRegExp.FindStringSubmatch(scrape.Text(p)) if nil == m { err = errors.New("There was no date match") return } i := r.MustParseInt bc.Time = time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[4]), i(m[5]), 0, 0, localLoc) t := time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[6]), i(m[7]), 0, 0, localLoc) if bc.Time.Hour() > t.Hour() || (bc.Time.Hour() == t.Hour() && bc.Time.Minute() > t.Minute()) { // after midnight t = t.AddDate(0, 0, 1) } bc.DtEnd = &t } // Language for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "og:locale" == scrape.Attr(n, "property") }) { if idx != 0 { err = errors.New("There was more than 1 <meta property='og:locale'/>") return } v := scrape.Attr(meta, "content")[0:2] bc.Language = &v } // Subject for idx, a := range scrape.FindAll(root, func(n *html.Node) bool { return atom.A == n.DataAtom && strings.HasPrefix(scrape.Attr(n, "class"), "link_broadcast media_broadcastSeries") }) { if idx != 0 { err = errors.New("There was more than 1 <a class='link_broadcast media_broadcastSeries'/>") return } u, _ := url.Parse(scrape.Attr(a, "href")) bc.Subject = bc.Source.ResolveReference(u) } // Modified for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property") }) { if idx != 0 { err = errors.New("There was more than 1 <meta property='og:article:modified_time'/>") return } v, _ := time.Parse(time.RFC3339, scrape.Attr(meta, "content")) bc.Modified = &v } // Author for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name") }) { if idx != 0 { err = errors.New("There was more than 1 <meta name='author'/>") return } s := scrape.Attr(meta, "content") bc.Author = &s } if "" == bc.Station.Identifier { panic("How can the identifier miss?") } bcs = append(bcs, bc) return }