Beispiel #1
0
func FiveThousandBest() (titles []string, err error) {
	res, err := http.Get("http://5000best.com/movies/1")
	if err != nil {
		return
	}

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return
	}

	doc, err := gokogiri.ParseHtml(body)
	if err != nil {
		return
	}

	exp := xpath.Compile("//a[@class='n']")

	nodes := doc.XPathCtx.Evaluate(doc.NodePtr(), exp)

	for _, np := range nodes {
		node := xml.NewNode(np, doc)
		title := node.InnerHtml()
		title = title[:len(title)-8]
		titles = append(titles, title)
	}

	return
}
Beispiel #2
0
func NewRecord(content []byte) (record *Record) {
	doc, err := gokogiri.ParseHtml([]byte(content))
	if err != nil {
		panic(err)
	}

	displayText := cleanUpContent(doc.String())
	record = &Record{RawText: content, DisplayText: displayText}
	dateStr := getInterp(doc.Root().NodePtr(), "date", doc)
	date, err := time.Parse("20060102", dateStr)
	if err != nil {
		record.Date = nil
	} else {
		record.Date = &date
	}

	xPath := xpath.NewXPath(doc.DocPtr())
	nodePtrs := xPath.Evaluate(doc.Root().NodePtr(),
		xpath.Compile("//div1"))

	node := xml.NewNode(nodePtrs[0], doc)
	record.Id = node.Attr("id")
	record.Type = node.Attr("type")

	record.processPersons(doc)
	record.processOffences(doc)
	record.processVerdicts(doc)
	record.processOffJoins(doc)
	return
}
Beispiel #3
0
func (record *Record) processOffJoins(doc *html.HtmlDocument) {
	xPath := xpath.NewXPath(doc.DocPtr())
	// join the offence with the defendants and verdict
	joinPtrs := xPath.Evaluate(doc.Root().NodePtr(),
		xpath.Compile("//join[@result='criminalCharge']"))

	for _, nodePtr := range joinPtrs {
		node := xml.NewNode(nodePtr, doc)
		targets := strings.Split(node.Attr("targets"), " ")
		var personId, offId, verdictId string
		for _, targetId := range targets {
			if strings.Contains(targetId, "defend") {
				personId = targetId
			}
			if strings.Contains(targetId, "off") {
				offId = targetId
			}
			if strings.Contains(targetId, "verdict") {
				verdictId = targetId
			}
		}
		offence := record.findOffence(offId)
		if offence == nil {
			panic("couldn't find offence " + offId)
		}
		person := record.findPerson(personId)
		if person != nil {
			offence.Defendants = append(offence.Defendants, person)
		}
		verdict := record.findVerdict(verdictId)
		if verdict != nil {
			offence.Verdict = verdict
		}
	}
}
Beispiel #4
0
// get the value out of an <interp> tag
func getInterp(basePtr unsafe.Pointer, interpType string, doc *html.HtmlDocument) (value string) {
	xPath := xpath.NewXPath(doc.DocPtr())
	nodePtrs := xPath.Evaluate(basePtr, xpath.Compile(".//interp[@type='"+
		interpType+"']"))
	if len(nodePtrs) == 1 {
		node := xml.NewNode(nodePtrs[0], doc)
		value = node.Attr("value")
	}
	return
}
Beispiel #5
0
func (record *Record) processVerdicts(doc *html.HtmlDocument) {
	xPath := xpath.NewXPath(doc.DocPtr())
	verdictPtrs := xPath.Evaluate(doc.Root().NodePtr(),
		xpath.Compile("//rs[@type='verdictDescription']"))
	verdicts := make([]Verdict, len(verdictPtrs))

	for i, nodePtr := range verdictPtrs {
		node := xml.NewNode(nodePtr, doc)
		verdict := Verdict{}
		verdict.Id = node.Attr("id")
		verdict.Desc = cleanUpContent(node.Content())
		verdict.SetType(getInterp(nodePtr, "verdictCategory", doc))
		verdicts[i] = verdict
	}
	record.Verdicts = verdicts
}
Beispiel #6
0
func (record *Record) processOffences(doc *html.HtmlDocument) {
	xPath := xpath.NewXPath(doc.DocPtr())
	offencePtrs := xPath.Evaluate(doc.Root().NodePtr(),
		xpath.Compile("//rs[@type='offenceDescription']"))
	offences := make([]Offence, len(offencePtrs))

	for i, nodePtr := range offencePtrs {
		node := xml.NewNode(nodePtr, doc)
		offence := Offence{}
		offence.Id = node.Attr("id")
		offence.Category = getInterp(nodePtr, "offenceCategory", doc)
		offence.SubCategory = getInterp(nodePtr, "offenceSubcategory", doc)
		offence.Desc = cleanUpContent(node.Content())
		offences[i] = offence
	}
	record.Offences = offences
}
Beispiel #7
0
func (record *Record) processPersons(doc *html.HtmlDocument) {
	xPath := xpath.NewXPath(doc.DocPtr())
	personPtrs := xPath.Evaluate(doc.Root().NodePtr(),
		xpath.Compile("//persname"))
	persons := make([]Person, len(personPtrs))

	for i, nodePtr := range personPtrs {
		node := xml.NewNode(nodePtr, doc)
		person := Person{}
		person.Id = node.Attr("id")
		person.GivenName = getInterp(nodePtr, "given", doc)
		person.Surname = getInterp(nodePtr, "surname", doc)
		person.SetType(node.Attr("type"))
		person.SetGender(getInterp(nodePtr, "gender", doc))
		persons[i] = person
	}
	record.Persons = persons
}
Beispiel #8
0
func (store *Store) LoadPrice(url string) (price float64, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return
	}

	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)

	if err != nil {
		return
	}

	doc, err := gokogiri.ParseHtml(body)

	if err != nil {
		return
	}

	nxpath := xpath.NewXPath(doc.DocPtr())
	nodes, err := nxpath.Evaluate(doc.DocPtr(), store.compiledXPath)

	if err != nil {
		return
	}

	if len(nodes) == 0 {
		fmt.Printf("Check XPath correctness (not found) for domain: %s\n", store.Domain)
		return
	}

	price_raw := xml.NewNode(nodes[0], doc).InnerHtml()
	price_raw = strings.Trim(price_raw, "$ \n\r")
	price, err = strconv.ParseFloat(price_raw, 64)

	if err != nil {
		fmt.Printf("Check XPath correctness (not monetary) for domain: %s\n", store.Domain)
		return
	}

	return
}
Beispiel #9
0
func parsefragment(document xml.Document, node *xml.XmlNode, content, url []byte, options xml.ParseOption) (fragment *xml.DocumentFragment, err error) {
	//set up pointers before calling the C function
	var contentPtr, urlPtr unsafe.Pointer
	if len(url) > 0 {
		urlPtr = unsafe.Pointer(&url[0])
	}

	var root xml.Node
	if node == nil {
		containBody := (bytes.Index(content, bodySigBytes) >= 0)

		content = append(fragmentWrapper, content...)
		contentPtr = unsafe.Pointer(&content[0])
		contentLen := len(content)

		inEncoding := document.InputEncoding()
		var encodingPtr unsafe.Pointer
		if len(inEncoding) > 0 {
			encodingPtr = unsafe.Pointer(&inEncoding[0])
		}
		htmlPtr := C.htmlParseFragmentAsDoc(document.DocPtr(), contentPtr, C.int(contentLen), urlPtr, encodingPtr, C.int(options), nil, 0)

		//Note we've parsed the fragment within the given document
		//the root is not the root of the document; rather it's the root of the subtree from the fragment
		html := xml.NewNode(unsafe.Pointer(htmlPtr), document)

		if html == nil {
			err = ErrFailParseFragment
			return
		}
		root = html

		if !containBody {
			root = html.FirstChild()
			html.AddPreviousSibling(root)
			html.Remove() //remove html otherwise it's leaked
		}
	} else {
		//wrap the content
		newContent := append(fragmentWrapperStart, content...)
		newContent = append(newContent, fragmentWrapperEnd...)
		contentPtr = unsafe.Pointer(&newContent[0])
		contentLen := len(newContent)
		rootElementPtr := C.htmlParseFragment(node.NodePtr(), contentPtr, C.int(contentLen), urlPtr, C.int(options), nil, 0)
		if rootElementPtr == nil {
			//try to parse it as a doc
			fragment, err = parsefragment(document, nil, content, url, options)
			return
		}
		if rootElementPtr == nil {
			err = ErrFailParseFragment
			return
		}
		root = xml.NewNode(unsafe.Pointer(rootElementPtr), document)
	}

	fragment = &xml.DocumentFragment{}
	fragment.Node = root
	fragment.InEncoding = document.InputEncoding()
	fragment.OutEncoding = document.OutputEncoding()

	document.BookkeepFragment(fragment)
	return
}