// TODO: (eg newsquest sites) // <span data-format="article-display" data-show-date="always" data-show-time="today-only" data-timestamp="1461211200" itemprop="datePublished" class="timestamp formatTimeStamp" full-date="20.04.2016">20 mins ago</span> // // // func grabDates(root *html.Node, artURL *url.URL, contentNodes []*html.Node, headlineNode *html.Node, scriptNodes []*html.Node, cruftBlocks []*html.Node) (fuzzytime.DateTime, fuzzytime.DateTime) { dbug := Debug.DatesLogger var publishedCandidates = make(dateCandidateList, 0, 32) var updatedCandidates = make(dateCandidateList, 0, 32) // there might be an obvious date in the URL urlDate := dateFromURL(artURL) // look for timestamps in <meta> tags metaPublished, metaUpdated := datesFromMeta(root) if metaPublished.HasFullDate() && metaUpdated.HasFullDate() { return metaPublished, metaUpdated } evilPublished := checkEvilSpecialCaseHacks(artURL, scriptNodes) // get a list of elements between headline and content betwixt := []*html.Node{} if headlineNode != nil && len(contentNodes) > 0 { var err error betwixt, err = interveningElements(headlineNode, contentNodes[0]) if err != nil { betwixt = []*html.Node{} } } betwixtValue := 1.0 for _, node := range dateSels.tags.MatchAll(root) { var txt string // a couple of cases where we want text from attrs instead switch node.DataAtom { case atom.Time: txt = getAttr(node, "datetime") if txt == "" { txt = getTextContent(node) } case atom.Abbr: txt = getAttr(node, "title") if txt == "" { txt = getTextContent(node) } default: // check for obvious machine-readable timestamps foo := getAttr(node, "data-timestamp") if foo != "" { i, err := strconv.ParseInt(foo, 10, 64) if err == nil { tm := time.Unix(i, 0).UTC() if tm.Year() < 10000 { // OK, looks sensible(ish). We'll use it. // Cheesy hack - pass it on as text for re-parsing! txt = tm.Format(time.RFC3339) } // else probable javascript timestamp TODO: divide by 1000 and try again! } } if txt == "" { txt = getTextContent(node) } } if len(txt) < 6 || len(txt) > 150 { continue // too short } // got some date/time info? dt, spans, _ := fuzzytime.WesternContext.Extract(txt) if dt.Empty() { continue // no data, (or there was an error) } //dbug.Printf("considering %s (%s) '%f'\n", describeNode(node), dt.String(), dateProportion) publishedC := newDateCandidate(node, txt, dt) updatedC := newDateCandidate(node, txt, dt) var dateProportion float64 if node.DataAtom == atom.P { // for paragraphs, calculate proportion of text which is date/time mcnt := 0 for _, span := range spans { mcnt += span.End - span.Begin } dateProportion = float64(mcnt) / float64(len(txt)) if dateProportion < 0.5 { continue // too much text, not enough date. } } // prefer datetimes over just dates (or times) if dt.HasYear() && dt.HasMonth() && dt.HasDay() { if dt.HasHour() && dt.HasMinute() { publishedC.addPoints(0.75, "datetime") updatedC.addPoints(0.75, "datetime") } } if dt.Date.Empty() { publishedC.addPoints(-0.5, "no date") updatedC.addPoints(-0.5, "no date") } // TEST: is machine readable? if node.DataAtom == atom.Time { publishedC.addPoints(1, "<time>") updatedC.addPoints(1, "<time>") } // TEST: indicative text ("posted:" etc...) if datePats.publishedIndicativeText.MatchString(txt) { publishedC.addPoints(1, "indicative text") } // TEST: indicative text ("posted:" etc...) if datePats.updatedIndicativeText.MatchString(txt) { updatedC.addPoints(1, "indicative text") } // TEST: hAtom date markup if dateSels.hatomPublished.Match(node) { publishedC.addPoints(2, "hentry .published") } if dateSels.hatomUpdated.Match(node) { publishedC.addPoints(2, "hentry .updated") } // TEST: likely class or id? if datePats.genericClasses.MatchString(getAttr(node, "class")) { updatedC.addPoints(1, "likely class") publishedC.addPoints(1, "likely class") } if datePats.genericClasses.MatchString(getAttr(node, "id")) { updatedC.addPoints(1, "likely id") publishedC.addPoints(1, "likely id") } // TEST: likely class or id for published? if datePats.publishedClasses.MatchString(getAttr(node, "class")) { publishedC.addPoints(1, "likely class for published") } if datePats.publishedClasses.MatchString(getAttr(node, "id")) { publishedC.addPoints(1, "likely id for published") } // TEST: likely class or id for updated? if datePats.updatedClasses.MatchString(getAttr(node, "class")) { updatedC.addPoints(1, "likely class for updated") } if datePats.updatedClasses.MatchString(getAttr(node, "id")) { updatedC.addPoints(1, "likely id for updated") } // TEST: RDFa property="dc:issued" (issued, updated, created etc) if dateSels.rdfaPublished.Match(node) { publishedC.addPoints(1, "likely rdfa markup for published") } if dateSels.rdfaUpdated.Match(node) { updatedC.addPoints(1, "likely rdfa markup for updated") } // TEST: within article content? for _, contentNode := range contentNodes { if contains(contentNode, node) { publishedC.addPoints(1, "contained within content") updatedC.addPoints(1, "contained within content") } } // TEST: share a parent with content? for _, contentNode := range contentNodes { if contains(contentNode.Parent, node) { publishedC.addPoints(1, "near content") updatedC.addPoints(1, "near content") } } // TEST: within a crfut block? (comment, social link, whatever) for _, cruftBlock := range cruftBlocks { if contains(cruftBlock, node) { desc := fmt.Sprintf("inside cruft (%s)", describeNode(cruftBlock)) publishedC.addPoints(-3, desc) updatedC.addPoints(-3, desc) } } // TODO: TEST: agrees with <meta> tag values? // TEST: between headline and content? for _, e := range betwixt { if e == node { // first one preferred updatedC.addPoints(betwixtValue, "between headline and content") publishedC.addPoints(betwixtValue, "between headline and content") betwixtValue *= 0.9 break } } // TEST: matches date info in URL? // (if not, fill in any missing fields using the URL date!) if !urlDate.Empty() { if urlDate.Conflicts(&dt.Date) { updatedC.addPoints(-1, "clash with date in url") publishedC.addPoints(-1, "clash with date in url") } else { dt.Date.Merge(&urlDate) updatedC.dt = dt publishedC.dt = dt } } // TODO: TEST - proximity to top or bottom of article content // TODO: check for value-title pattern? if publishedC.total() > 0 { publishedCandidates = append(publishedCandidates, publishedC) } if updatedC.total() > 0 { updatedCandidates = append(updatedCandidates, updatedC) } } dbug.Printf("date from url: %s\n", urlDate.String()) dbug.Printf("meta updated: %s\n", metaUpdated.String()) dbug.Printf("meta published: %s\n", metaPublished.String()) if !evilPublished.Empty() { dbug.Printf("evilspecialcase published: '%s'\n", evilPublished.String()) } publishedCandidates.Sort() dbug.Printf("PUBLISHED: %d candidates\n", len(publishedCandidates)) for _, c := range publishedCandidates { c.dump(dbug) } updatedCandidates.Sort() dbug.Printf("UPDATED: %d candidates\n", len(updatedCandidates)) for _, c := range updatedCandidates { c.dump(dbug) } var published, updated fuzzytime.DateTime // pick best candidate for published if best, err := publishedCandidates.TopDate(); err == nil { published = best.dt } else { dbug.Printf("published: Didn't pick any (%s)", err) } if published.Empty() { if !metaPublished.Empty() { published = metaPublished } else if !urlDate.Empty() { published = fuzzytime.DateTime{Date: urlDate} } else if !evilPublished.Empty() { published = evilPublished } } // updated: use meta data if present if metaUpdated.HasFullDate() { updated = metaUpdated } else { if best, err := updatedCandidates.TopDate(); err == nil { updated = best.dt // if time only, use date from published if updated.Date.Empty() && !updated.Time.Empty() { updated.Date = published.Date } } else { dbug.Printf("updated: Didn't pick any (%s)", err) } } return published, updated }