Beispiel #1
0
// NewDocument reads HTML from r, parses it and builds a Document from the
// resulting node tree.
//
// The returned document has its title detected, its body processed by
// cleanBody, countText and parseBody, and all of its chunks linked to their
// direct neighbours through the Prev and Next fields.
//
// Errors: a parse error from html.Parse is returned unchanged. ErrNoHTML,
// ErrNoHead or ErrNoBody is returned (checked in that order) when the
// corresponding node could not be located in the parsed tree.
func NewDocument(r io.Reader) (*Document, error) {
	root, err := html.Parse(r)
	if err != nil {
		return nil, err
	}

	doc := new(Document)
	doc.Title = util.NewText()
	doc.Chunks = make([]*Chunk, 0, 512)
	doc.linkText = make(map[*html.Node]int)
	doc.normText = make(map[*html.Node]int)

	// Walk the tree once and remember the html, head and body nodes.
	// NOTE(review): IterSkip presumably tells iterateNode not to descend
	// into head/body, while IterNext continues the walk -- confirm against
	// iterateNode.
	iterateNode(root, func(n *html.Node) int {
		if n.DataAtom == atom.Html {
			doc.html = n
			return IterNext
		}
		if n.DataAtom == atom.Body {
			doc.body = n
			return IterSkip
		}
		if n.DataAtom == atom.Head {
			doc.head = n
			return IterSkip
		}
		// Anything else: keep walking until all nodes are found.
		return IterNext
	})

	if doc.html == nil {
		return nil, ErrNoHTML
	}
	if doc.head == nil {
		return nil, ErrNoHead
	}
	if doc.body == nil {
		return nil, ErrNoBody
	}

	// Title detection, step one: look for a non-empty Open Graph title
	// (<meta property="og:title" content="...">). It is preferred over the
	// title element because the metadata tends to be a bit cleaner.
	var ogTitle string
	iterateNode(doc.head, func(n *html.Node) int {
		if n.Type != html.ElementNode || n.DataAtom != atom.Meta {
			return IterNext
		}
		var property, content string
		for _, a := range n.Attr {
			if a.Key == "property" {
				property = a.Val
			} else if a.Key == "content" {
				content = a.Val
			}
		}
		if property != "og:title" || content == "" {
			return IterNext
		}
		ogTitle = content
		return IterStop
	})

	// Title detection, step two: without Open Graph metadata, fall back to
	// the text of the first title element found in head.
	if ogTitle == "" {
		iterateNode(doc.head, func(n *html.Node) int {
			if n.Type != html.ElementNode || n.DataAtom != atom.Title {
				return IterNext
			}
			iterateText(n, doc.Title.WriteString)
			return IterStop
		})
	} else {
		doc.Title.WriteString(ogTitle)
	}

	// The order of these three passes is kept as is; parseBody is presumably
	// what fills doc.Chunks -- TODO confirm.
	doc.cleanBody(doc.body, 0)
	doc.countText(doc.body, false)
	doc.parseBody(doc.body)

	// Link every chunk to its predecessor and successor. The first chunk
	// keeps a nil Prev and the last chunk keeps a nil Next.
	last := len(doc.Chunks) - 1
	for i, c := range doc.Chunks {
		if i > 0 {
			c.Prev = doc.Chunks[i-1]
		}
		if i < last {
			c.Next = doc.Chunks[i+1]
		}
	}
	return doc, nil
}
Beispiel #2
0
// NewChunk builds a Chunk for node n, which belongs to document doc.
//
// The chunk's Base is n itself when n is an element node, or n's parent when
// n is a text node. For any other node type Base is left unset. The chunk
// text is collected from n via iterateText.
//
// Errors:
//   - ErrNoParent: n is a text node without a parent.
//   - ErrNoText:   no text was collected from n.
//   - ErrNoBlock:  getParentBlock found no block for the base node.
func NewChunk(doc *Document, n *html.Node) (*Chunk, error) {
	chunk := &Chunk{Text: util.NewText()}

	// Pick the base node: elements are their own base, text nodes use the
	// enclosing element. Orphaned text nodes are rejected.
	if n.Type == html.ElementNode {
		chunk.Base = n
	} else if n.Type == html.TextNode {
		if n.Parent == nil {
			return nil, ErrNoParent
		}
		chunk.Base = n.Parent
	}

	// Collect the text below n; chunks without any text are not produced.
	iterateText(n, chunk.Text.WriteString)
	if chunk.Text.Len() == 0 {
		return nil, ErrNoText
	}

	// Determine block and container of the base node. The block is the
	// result of getParentBlock for the base node itself, the container is
	// the result of getParentBlock for the block's parent. getParentBlock
	// is expected to return the first block-level element found while
	// ascending from the node it is given, e.g.
	//
	//   <div>                         <- Container
	//     <p>                         <- Block
	//       <span>
	//         <i>Hello World</i>      <- Base
	//       </span>
	//     </p>
	//   </div>
	//
	// When the base node is itself block-level (say the <p> above holds the
	// text directly), base and block are the same node.
	block := getParentBlock(chunk.Base)
	if block == nil {
		return nil, ErrNoBlock
	}
	chunk.Block = block

	// Default the container to the block, so the field is never nil, and
	// replace it only if a block exists above the block's parent.
	chunk.Container = chunk.Block
	if container := getParentBlock(chunk.Block.Parent); container != nil {
		chunk.Container = container
	}

	// Keep the ancestors currently recorded in the document.
	chunk.Ancestors = doc.ancestors

	// Share of link text within all text counted for the chunk's block.
	// Cross references inside running text give a small ratio,
	//
	//   <p>Long text .... <a>short text</a> ... </p>
	//
	// whereas navigation or "related content" links give a high one:
	//
	//   <li><a>See also: ...</a></li>
	//
	// With no text counted at all the ratio is defined as zero.
	linked := doc.linkText[chunk.Block]
	plain := doc.normText[chunk.Block]
	chunk.LinkText = 0.0
	if linked != 0 || plain != 0 {
		chunk.LinkText = float32(linked) / float32(linked+plain)
	}

	// Collect content classes from the nearest "class" attribute and the
	// nearest "itemprop" (HTML5 microdata) attribute, searching from the
	// base node upwards. IDs are deliberately not considered.
	chunk.Classes = []string{}
	seenClass, seenMicro := false, false
	for node := chunk.Base; node != nil; node = node.Parent {
		if node.Type != html.ElementNode {
			continue
		}
		for _, attr := range node.Attr {
			// Only the first occurrence of each attribute kind is used;
			// every other attribute is skipped.
			if attr.Key == "class" && !seenClass {
				seenClass = true
			} else if attr.Key == "itemprop" && !seenMicro {
				seenMicro = true
			} else {
				continue
			}
			chunk.Classes = append(chunk.Classes, strings.Fields(attr.Val)...)
		}
		// Stop ascending once both attribute kinds were found.
		if seenClass && seenMicro {
			break
		}
	}
	return chunk, nil
}
Beispiel #3
0
// Extract builds an article from the chunks of doc that are considered
// relevant and returns it together with the document title.
//
// Any previous state of ext is discarded at the start of the call. After a
// successful classification ext.Labels holds one entry per chunk of doc.
//
// Processing steps:
//
//  1. A chunkFeature vector is written for every chunk (element type,
//     parent and sibling types, ancestors, text statistics, class and
//     cluster statistics).
//  2. Every feature whose observed maximum exceeds one is min-max
//     normalized; the remaining features are left as they are.
//  3. The chunk feature scores are clustered per container and, together
//     with the chunk itself and its similarity to the title, written into
//     a boostFeature vector per chunk.
//  4. The boost feature scores are clustered per block. All chunks of a
//     block whose cluster score is above 0.5 are labelled as content, so
//     that a block is never split.
//  5. For every labelled block one heading or paragraph is appended to the
//     result, containing the text of all chunks of that block.
//
// According to the original author's notes the first score comes from a
// logistic regression model and the second from a random forest; the models
// themselves are not part of this function.
//
// Errors: ErrNoChunks if doc has no chunks, ErrEmptyResult if no text made
// it into the result.
func (ext *Extractor) Extract(doc *html.Document) (*util.Article, error) {
	*ext = Extractor{}

	total := len(doc.Chunks)
	if total == 0 {
		return nil, ErrNoChunks
	}

	chunkFeatures := make([]chunkFeature, total)
	boostFeatures := make([]boostFeature, total)

	// Per-class and per-cluster statistics of the document; they feed the
	// last two feature groups below.
	classStats := doc.GetClassStats()
	clusterStats := doc.GetClusterStats()

	// Step 1: write the chunk feature vectors.
	cw := new(chunkFeatureWriter)
	for i, chunk := range doc.Chunks {
		cw.Assign(chunkFeatures[i][:])
		cw.WriteElementType(chunk)
		cw.WriteParentType(chunk)
		cw.WriteSiblingTypes(chunk)
		cw.WriteAncestors(chunk)
		cw.WriteTextStat(chunk)
		cw.WriteTextStatSiblings(chunk)
		cw.WriteClassStat(chunk, classStats)
		cw.WriteClusterStat(chunk, clusterStats)
	}

	// Step 2a: find minimum and maximum per feature. Both start at zero, so
	// the minimum is never above zero and the maximum never below it.
	var empMin, empMax chunkFeature
	for i := range chunkFeatures {
		for j, val := range chunkFeatures[i] {
			if val < empMin[j] {
				empMin[j] = val
			} else if val > empMax[j] {
				empMax[j] = val
			}
		}
	}

	// Step 2b: min-max normalization. A feature whose maximum is not
	// greater than one is assumed to be normalized already.
	for i := range chunkFeatures {
		vec := &chunkFeatures[i]
		for j := range vec {
			if empMax[j] > 1.0 {
				vec[j] = (vec[j] - empMin[j]) / (empMax[j] - empMin[j])
			}
		}
	}

	// Step 3: cluster chunk scores by container, then write the boost
	// feature vectors using those clusters.
	byContainer := newClusterMap()
	for i, chunk := range doc.Chunks {
		byContainer.Add(chunk.Container, chunk, chunkFeatures[i].Score())
	}

	bw := new(boostFeatureWriter)
	for i, chunk := range doc.Chunks {
		bw.Assign(boostFeatures[i][:])
		bw.WriteChunk(chunk)
		bw.WriteCluster(chunk, byContainer[chunk.Container])
		bw.WriteTitleSimilarity(chunk, doc.Title)
	}

	// Step 4: cluster boost scores by block and label the chunks. The text
	// length is passed along with the score.
	// NOTE(review): the length presumably acts as a weight -- confirm
	// against the cluster map's Add method.
	byBlock := newClusterMap()
	for i, chunk := range doc.Chunks {
		byBlock.Add(chunk.Block, chunk, boostFeatures[i].Score(), float32(chunk.Text.Len()))
	}

	ext.Labels = make([]bool, total)
	for i, chunk := range doc.Chunks {
		cluster, ok := byBlock[chunk.Block]
		if !ok {
			continue
		}
		ext.Labels[i] = cluster.Score() > 0.5
	}

	// Step 5: assemble the article. A block is emitted when its first
	// labelled chunk is reached and is then removed from the map, so every
	// block contributes at most one heading or paragraph.
	result := new(util.Article)
	result.Title = doc.Title.String()
	for i, chunk := range doc.Chunks {
		if !ext.Labels[i] {
			continue
		}
		cluster, ok := byBlock[chunk.Block]
		if !ok {
			continue
		}

		text := util.NewText()
		for _, member := range cluster.Chunks {
			text.WriteText(member.Text)
		}

		// The chunk that triggered the emission decides whether the whole
		// block is treated as a heading or as a paragraph.
		isHeading := chunk.IsHeading()
		content := text.String()
		if isHeading {
			result.Append(util.Heading(content))
		} else {
			result.Append(util.Paragraph(content))
		}
		delete(byBlock, chunk.Block)
	}

	if len(result.Text) == 0 {
		return nil, ErrEmptyResult
	}
	return result, nil
}