func (day *timeURL) parseBroadcastURLsNode(root *html.Node) (ret []*broadcastURL, err error) { const closeDownHour int = 5 for _, h4 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H4 == n.DataAtom }) { year, month, day_, err := timeForH4(scrape.Text(h4), &day.Time) if nil != err { panic(err) } // fmt.Printf("%d-%d-%d %s\n", year, month, day, err) for _, a := range scrape.FindAll(h4.Parent, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Dt == n.Parent.DataAtom }) { m := hourMinuteTitleRegExp.FindStringSubmatch(scrape.Text(a)) if nil == m { panic(errors.New("Couldn't parse <a>")) } ur, _ := url.Parse(scrape.Attr(a, "href")) hour := r.MustParseInt(m[1]) dayOffset := 0 if hour < closeDownHour { dayOffset = 1 } // fmt.Printf("%s %s\n", b.r.TimeURL.String(), b.Title) bcu := broadcastURL(r.BroadcastURL{ TimeURL: r.TimeURL{ Time: time.Date(year, month, day_+dayOffset, hour, r.MustParseInt(m[2]), 0, 0, localLoc), Source: *day.Source.ResolveReference(ur), Station: day.Station, }, Title: strings.TrimSpace(m[3]), }) ret = append(ret, &bcu) } } return }
func timeForH4(h4 string, now *time.Time) (year int, mon time.Month, day int, err error) { m := dayMonthRegExp.FindStringSubmatch(h4) if nil == m { // err = error.New("Couldn't parse " + h4) return } mon = time.Month(r.MustParseInt(m[2])) year = yearForMonth(mon, now) day = r.MustParseInt(m[1]) return }
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) { nodes := scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "time" == scrape.Attr(n, "class") }) ret = make([]*r.Broadcast, len(nodes)) for index, tim := range nodes { // prepare response bc := r.Broadcast{ BroadcastURL: r.BroadcastURL{ TimeURL: r.TimeURL(*day), }, } // some defaults bc.Language = &lang_de bc.Publisher = &publisher // set start time { div_t := strings.TrimSpace(scrape.Text(tim)) if 5 != len(div_t) { continue } hour := r.MustParseInt(div_t[0:2]) minute := r.MustParseInt(div_t[3:5]) bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone) if index > 0 { ret[index-1].DtEnd = &bc.Time } } for _, tit := range scrape.FindAll(tim.Parent, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Div == n.Parent.DataAtom && "descr" == scrape.Attr(n.Parent, "class") }) { // Title bc.Title = strings.TrimSpace(scrape.Text(tit)) href := scrape.Attr(tit, "href") if "" != href { u, _ := url.Parse(href) bc.Subject = day.Source.ResolveReference(u) } desc_node := tit.Parent desc_node.RemoveChild(tit) description := r.TextWithBrFromNodeSet([]*html.Node{desc_node}) bc.Description = &description // fmt.Fprintf(os.Stderr, "\n") } ret[index] = &bc } // fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String()) if len(nodes) > 0 { midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone) ret[len(nodes)-1].DtEnd = &midnight } return }
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) { // fmt.Fprintf(os.Stderr, "%s\n", day.Source.String()) index := 0 for _, at := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "si_dayList_starttime" == scrape.Attr(n, "class") }) { // prepare response bc := r.Broadcast{ BroadcastURL: r.BroadcastURL{ TimeURL: r.TimeURL(*day), }, } // some defaults bc.Language = &lang_de bc.Publisher = &publisher empty_str := "" bc.Description = &empty_str // set start time { hhmm := scrape.Text(at) // fmt.Fprintf(os.Stderr, " a_id=%s\n", a_id) hour := r.MustParseInt(hhmm[0:2]) minute := r.MustParseInt(hhmm[3:5]) if 24 < hour || 60 < minute { continue } bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone) if index > 0 { ret[index-1].DtEnd = &bc.Time } } // Title for idx, div := range scrape.FindAll(at.Parent, func(n *html.Node) bool { return atom.Div == n.DataAtom && "si_dayList_description" == scrape.Attr(n, "class") }) { if idx != 0 { err = errors.New("There was more than 1 <div class='si_dayList_description'>") return } bc.Title = scrape.Text(div) // u, _ := url.Parse(scrape.Attr(h3_a, "href")) // bc.Subject = day.Source.ResolveReference(u) bc.Title = strings.TrimSpace(bc.Title) for idx1, a := range scrape.FindAll(div, func(n *html.Node) bool { return atom.A == n.DataAtom }) { if idx1 != 0 { err = errors.New("There was more than 1 <a>") return } u, _ := url.Parse(scrape.Attr(a, "href")) bc.Subject = day.Source.ResolveReference(u) } } // fmt.Fprintf(os.Stderr, "\n") ret = append(ret, &bc) index += 1 } // fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String()) if index > 0 { midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone) ret[index-1].DtEnd = &midnight } return }
// parseBroadcastsFromNode builds Broadcasts from the dlf/drk schedule table
// rooted at root. Each entry is an anchor inside <tr><td class="time">; the
// anchor's name attribute carries the start time as "HHMM" and becomes the
// URL fragment of the broadcast's Source. Each broadcast's DtEnd is set to
// the start of the next entry; the last one ends at midnight (hour 24, which
// time.Date normalizes to 00:00 of the next day).
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
	// fmt.Fprintf(os.Stderr, "%s\n", day.Source.String())
	index := 0 // number of broadcasts appended so far
	for _, at := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.A == n.DataAtom && atom.Td == n.Parent.DataAtom && atom.Tr == n.Parent.Parent.DataAtom && "time" == scrape.Attr(n.Parent, "class")
	}) {
		// prepare response
		bc := r.Broadcast{
			BroadcastURL: r.BroadcastURL{
				TimeURL: r.TimeURL(*day),
			},
		}
		// some defaults
		bc.Language = &lang_de
		{
			// Publisher depends on which station this day-URL belongs to.
			publisher := "http://www.deutschlandfunk.de/"
			if "drk" == day.Station.Identifier {
				publisher = "http://www.deutschlandradiokultur.de/"
			}
			bc.Publisher = &publisher
		}
		// set start time from the anchor's name attribute ("HHMM")
		{
			a_id := scrape.Attr(at, "name")
			if "" == a_id {
				continue // no anchor name, no start time: skip entry
			}
			bc.Source.Fragment = a_id
			hour := r.MustParseInt(a_id[0:2])
			minute := r.MustParseInt(a_id[2:4])
			if 24 < hour || 60 < minute {
				continue // implausible time: skip entry
			}
			bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
			if index > 0 {
				// previous broadcast ends where this one begins
				ret[index-1].DtEnd = &bc.Time
			}
		}
		// Title: from the single <h3> in the sibling <td class="description">
		for idx, h3 := range scrape.FindAll(at.Parent.Parent, func(n *html.Node) bool {
			return atom.H3 == n.DataAtom && atom.Td == n.Parent.DataAtom && atom.Tr == n.Parent.Parent.DataAtom && "description" == scrape.Attr(n.Parent, "class")
		}) {
			if idx != 0 {
				err = errors.New("There was more than 1 <tr><td class='description'><h3>")
				return
			}
			// purge 'aufnehmen' link: strip the "record" helper anchors so
			// they don't leak into the title/description text.
			for _, chi := range scrape.FindAll(h3, func(n *html.Node) bool {
				return atom.A == n.DataAtom && "psradio" == scrape.Attr(n, "class")
			}) {
				h3.RemoveChild(chi)
			}
			// fmt.Fprintf(os.Stderr, " '%s'\n", scrape.Text(h3))
			// Title text and subject link come from the single remaining <a>.
			for idx, h3_a := range scrape.FindAll(h3, func(n *html.Node) bool { return atom.A == n.DataAtom }) {
				if idx != 0 {
					err = errors.New("There was more than 1 <tr><td class='description'><h3><a>")
					return
				}
				bc.Title = scrape.Text(h3_a)
				u, _ := url.Parse(scrape.Attr(h3_a, "href"))
				bc.Subject = day.Source.ResolveReference(u)
			}
			bc.Title = strings.TrimSpace(bc.Title)
			if "" == bc.Title {
				// Unlinked broadcast: fall back to the h3's own text children.
				bc.Title = r.TextChildrenNoClimb(h3)
			}
			// fmt.Fprintf(os.Stderr, " '%s'", bc.Title)
			{
				// Description is the text of the <p> siblings of the h3.
				description := r.TextWithBrFromNodeSet(scrape.FindAll(h3.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom }))
				bc.Description = &description
			}
		}
		// fmt.Fprintf(os.Stderr, "\n")
		ret = append(ret, &bc)
		index += 1
	}
	// fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String())
	if index > 0 {
		midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
		ret[index-1].DtEnd = &midnight
	}
	return
}