// NewDocument parses the HTML data provided through an io.Reader interface.
func NewDocument(r io.Reader) (*Document, error) {
	root, err := html.Parse(r)
	if err != nil {
		return nil, err
	}

	doc := &Document{
		Title:    util.NewText(),
		Chunks:   make([]*Chunk, 0, 512),
		linkText: make(map[*html.Node]int),
		normText: make(map[*html.Node]int),
	}

	// Assign the fields html, head and body from the HTML page.
	iterateNode(root, func(n *html.Node) int {
		switch n.DataAtom {
		case atom.Html:
			doc.html = n
			return IterNext
		case atom.Body:
			doc.body = n
			return IterSkip
		case atom.Head:
			doc.head = n
			return IterSkip
		}
		// Keep going as long as we're missing some nodes.
		return IterNext
	})

	switch {
	case doc.html == nil:
		return nil, ErrNoHTML
	case doc.head == nil:
		return nil, ErrNoHead
	case doc.body == nil:
		return nil, ErrNoBody
	}

	// Detect the document title: first check if the document provides
	// Open Graph metadata; if so, prefer it over the value of the title
	// element, because the metadata tends to be a tad cleaner.
	title := ""
	iterateNode(doc.head, func(n *html.Node) int {
		if n.Type == html.ElementNode && n.DataAtom == atom.Meta {
			prop, content := "", ""
			for _, attr := range n.Attr {
				switch attr.Key {
				case "property":
					prop = attr.Val
				case "content":
					content = attr.Val
				}
			}
			if prop == "og:title" && content != "" {
				title = content
				return IterStop
			}
		}
		return IterNext
	})
	if title != "" {
		doc.Title.WriteString(title)
	} else {
		iterateNode(doc.head, func(n *html.Node) int {
			if n.Type == html.ElementNode && n.DataAtom == atom.Title {
				iterateText(n, doc.Title.WriteString)
				return IterStop
			}
			return IterNext
		})
	}

	doc.cleanBody(doc.body, 0)
	doc.countText(doc.body, false)
	doc.parseBody(doc.body)

	// Now link neighboring chunks.
	min, max := 0, len(doc.Chunks)-1
	for i := range doc.Chunks {
		if i > min {
			doc.Chunks[i].Prev = doc.Chunks[i-1]
		}
		if i < max {
			doc.Chunks[i].Next = doc.Chunks[i+1]
		}
	}
	return doc, nil
}
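
// A minimal usage sketch for NewDocument (not part of this package; the
// URL, http.Get call and error handling are illustrative assumptions):
//
//	resp, err := http.Get("https://example.com/article")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer resp.Body.Close()
//
//	doc, err := NewDocument(resp.Body)
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println(doc.Title.String())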

// NewChunk creates a new chunk for the given node. It accepts either an
// ElementNode, whose TextNode children form the chunk's text, or a
// TextNode, whose parent element serves as the chunk's base.
func NewChunk(doc *Document, n *html.Node) (*Chunk, error) {
	chunk := new(Chunk)
	chunk.Text = util.NewText()

	switch n.Type {
	// If an ElementNode was passed, create the Text property using all
	// TextNode children.
	case html.ElementNode:
		chunk.Base = n
	// If a TextNode was passed, use the parent ElementNode for the
	// base field.
	case html.TextNode:
		// We don't allow orphaned chunks.
		if n.Parent == nil {
			return nil, ErrNoParent
		}
		chunk.Base = n.Parent
	}

	// Write the text of all TextNodes of n to chunk.Text.
	iterateText(n, chunk.Text.WriteString)

	// Don't produce chunks without text.
	if chunk.Text.Len() == 0 {
		return nil, ErrNoText
	}

	// Now detect the HTML block and container of the base node. The block
	// is the first block-level element found when ascending from the base
	// node. The container is the first block-level element found when
	// ascending from the block's parent.
	//
	// Example
	//
	// Base node is a block-level element:
	//
	//	<div>                    <- Container
	//	  <p>Hello World</p>     <- Base & Block
	//	</div>
	//
	// Base node is not a block-level element:
	//
	//	<div>                    <- Container
	//	  <p>                    <- Block
	//	    <span>
	//	      <i>Hello World</i> <- Base
	//	    </span>
	//	  </p>
	//	</div>
	if block := getParentBlock(chunk.Base); block != nil {
		chunk.Block = block
	} else {
		return nil, ErrNoBlock
	}

	// If there happens to be no block-level element above the block's
	// parent, use the block as container as well. This ensures that the
	// container field is never nil and saves us from nil pointer handling
	// in our code.
	if container := getParentBlock(chunk.Block.Parent); container != nil {
		chunk.Container = container
	} else {
		chunk.Container = chunk.Block
	}

	// Remember the ancestors in our chunk.
	chunk.Ancestors = doc.ancestors

	// Calculate the ratio between text inside links and text outside links
	// for the block node of the current element. This is useful to judge
	// the quality of a link. Links used as cross references inside the
	// document content have a small link-text-to-text ratio,
	//
	//	<p>Long text .... <a>short text</a> ... </p>
	//
	// whereas related content / navigation links have a high link-text-
	// to-text ratio:
	//
	//	<li><a>See also: ...</a></li>
	linkText := doc.linkText[chunk.Block]
	normText := doc.normText[chunk.Block]
	if normText == 0 && linkText == 0 {
		chunk.LinkText = 0.0
	} else {
		chunk.LinkText = float32(linkText) / float32(linkText+normText)
	}

	// Detect the classes of the current node. We use the good old class
	// attribute and the new HTML5 microdata (the itemprop attribute) to
	// determine the content class. Most IDs aren't really meaningful, so
	// no IDs here.
	chunk.Classes = make([]string, 0)

	// Ascend the parent nodes until we find a class attribute and some
	// microdata.
	haveClass := false
	haveMicro := false
	for prev := chunk.Base; prev != nil; prev = prev.Parent {
		if prev.Type != html.ElementNode {
			continue
		}
		for _, attr := range prev.Attr {
			switch {
			case !haveClass && attr.Key == "class":
				haveClass = true
			case !haveMicro && attr.Key == "itemprop":
				haveMicro = true
			default:
				continue
			}
			// The default: continue case above keeps us from reaching
			// this point for attributes we are not interested in.
			for _, val := range strings.Fields(attr.Val) {
				chunk.Classes = append(chunk.Classes, val)
			}
		}
		if haveClass && haveMicro {
			break
		}
	}
	return chunk, nil
}
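
// A rough sketch of how NewChunk could be driven from a node traversal.
// This is a hypothetical caller, shown for illustration only; in this
// package the actual chunk creation happens in parseBody, which also
// maintains doc.ancestors along the way.
//
//	iterateNode(doc.body, func(n *html.Node) int {
//		if n.Type == html.TextNode {
//			if chunk, err := NewChunk(doc, n); err == nil {
//				doc.Chunks = append(doc.Chunks, chunk)
//			}
//		}
//		return IterNext
//	})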

// Extract returns a list of relevant text chunks found in doc.
//
// How it works
//
// This function creates a feature vector for each chunk found in doc.
// A feature vector is a numerical representation of a chunk's properties,
// such as the HTML element type, the parent element type, the number of
// words, the number of sentences and so on.
//
// A logistic regression model calculates scores based on these feature
// vectors. Then, in a meta / ensemble learning step, a second type of
// feature vector is created from these scores. This second feature vector
// is fed to our random forest, and finally the random forest's predictions
// are used to generate the result.
//
// By now you might have noticed that I'm exceptionally bad at naming and
// describing things properly.
func (ext *Extractor) Extract(doc *html.Document) (*util.Article, error) {
	*ext = Extractor{}
	if len(doc.Chunks) == 0 {
		return nil, ErrNoChunks
	}

	chunkFeatures := make([]chunkFeature, len(doc.Chunks))
	boostFeatures := make([]boostFeature, len(doc.Chunks))

	// Count the number of words and sentences encountered per class.
	// This helps us detect the elements that contain the document text.
	classStats := doc.GetClassStats()
	clusterStats := doc.GetClusterStats()

	chunkFeatureWriter := new(chunkFeatureWriter)
	for i, chunk := range doc.Chunks {
		chunkFeatureWriter.Assign(chunkFeatures[i][:])
		chunkFeatureWriter.WriteElementType(chunk)
		chunkFeatureWriter.WriteParentType(chunk)
		chunkFeatureWriter.WriteSiblingTypes(chunk)
		chunkFeatureWriter.WriteAncestors(chunk)
		chunkFeatureWriter.WriteTextStat(chunk)
		chunkFeatureWriter.WriteTextStatSiblings(chunk)
		chunkFeatureWriter.WriteClassStat(chunk, classStats)
		chunkFeatureWriter.WriteClusterStat(chunk, clusterStats)
	}

	// Detect the empirical minimum and maximum value for each element of
	// the feature vector.
	empMin := chunkFeature{}
	empMax := chunkFeature{}
	for i := range chunkFeatures {
		for j, val := range chunkFeatures[i] {
			switch {
			case val < empMin[j]:
				empMin[j] = val
			case val > empMax[j]:
				empMax[j] = val
			}
		}
	}

	// Perform min-max normalization.
	for i := range chunkFeatures {
		feature := &chunkFeatures[i]
		for j, val := range chunkFeatures[i] {
			// If the maximum value is not greater than one, we assume
			// that the feature is already normalized and leave it
			// untouched.
			if empMax[j] > 1.0 {
				feature[j] = (val - empMin[j]) / (empMax[j] - empMin[j])
			}
		}
	}

	// Now cluster the chunks by container to calculate an average score
	// per container.
	clusterContainer := newClusterMap()
	for i, chunk := range doc.Chunks {
		clusterContainer.Add(chunk.Container, chunk, chunkFeatures[i].Score())
	}

	boostFeatureWriter := new(boostFeatureWriter)
	for i, chunk := range doc.Chunks {
		boostFeatureWriter.Assign(boostFeatures[i][:])
		boostFeatureWriter.WriteChunk(chunk)
		boostFeatureWriter.WriteCluster(chunk, clusterContainer[chunk.Container])
		boostFeatureWriter.WriteTitleSimilarity(chunk, doc.Title)
	}

	// Cluster the chunks by block.
	clusterBlock := newClusterMap()
	for i, chunk := range doc.Chunks {
		clusterBlock.Add(chunk.Block, chunk, boostFeatures[i].Score(), float32(chunk.Text.Len()))
	}

	// Label all chunks whose block cluster scores above the prediction
	// threshold. This makes sure that we don't split large blocks.
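	// For example, if a block holds three chunks scored 0.9, 0.6 and 0.2,
	// and their combined cluster score exceeds 0.5, all three chunks get
	// labeled, including the low-scoring one. (Illustrative numbers; how
	// Score() combines the per-chunk scores and text lengths is defined by
	// the cluster implementation.)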
	ext.Labels = make([]bool, len(doc.Chunks))
	for i, chunk := range doc.Chunks {
		if cluster, ok := clusterBlock[chunk.Block]; ok {
			ext.Labels[i] = cluster.Score() > 0.5
		}
	}

	result := &util.Article{Title: doc.Title.String()}
	for i, chunk := range doc.Chunks {
		if cluster, ok := clusterBlock[chunk.Block]; ok && ext.Labels[i] {
			// Join the text of all chunks that belong to this block
			// cluster.
			text := util.NewText()
			for _, member := range cluster.Chunks {
				text.WriteText(member.Text)
			}
			if chunk.IsHeading() {
				result.Append(util.Heading(text.String()))
			} else {
				result.Append(util.Paragraph(text.String()))
			}
			// Remove the cluster so its text is emitted only once.
			delete(clusterBlock, chunk.Block)
		}
	}

	if len(result.Text) == 0 {
		return nil, ErrEmptyResult
	}
	return result, nil
}
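
// A hedged end-to-end sketch combining Extract with the html package's
// NewDocument (illustrative only; r stands for any io.Reader holding an
// HTML page, and printing the elements of article.Text assumes they print
// reasonably via fmt):
//
//	doc, err := html.NewDocument(r)
//	if err != nil {
//		return err
//	}
//	ext := new(Extractor)
//	article, err := ext.Extract(doc)
//	if err != nil {
//		return err
//	}
//	fmt.Println(article.Title)
//	for _, block := range article.Text {
//		fmt.Println(block)
//	}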