Пример #1
0
// Precompiled patterns used by ParseJD. They are compiled once at package
// initialisation instead of on every call.
var (
	// jdOverviewListRe captures the inner HTML of the overview parameter list.
	jdOverviewListRe = regexp.MustCompile(`(?s)<ul id="parameter2" class="p-parameter-list">(.*?)</ul>`)
	// jdOverviewItemRe captures, for one overview item, (1) the title
	// attribute, (2) the label before the colon and (3) the visible text.
	jdOverviewItemRe = regexp.MustCompile(`<li title='(.*?)'>(.*?):(.*?)</li>`)
	// jdSpecTableRe captures the inner HTML of the detailed specification table.
	jdSpecTableRe = regexp.MustCompile(`(?s)<table cellpadding="0" cellspacing="1" width="100%" border="0" class="Ptable">(.*)</table>`)
	// jdSpecSectionRe matches one section of the table (e.g. body, CPU,
	// memory): group 1 is the section heading, and the match runs up to and
	// including the opening of the next heading cell.
	jdSpecSectionRe = regexp.MustCompile(`="2">(.*)</th><tr>(?s:.*?)<th class="tdTitle" colspan`)
	// jdSpecRowRe captures (1) the parameter name and (2) its value from one
	// table row (e.g. memory 4G, disk 500GB).
	jdSpecRowRe = regexp.MustCompile(`<td class="tdTitle">(.*)</td><td>(.*)</td>`)
	// jdPriceRe captures the "p" and "m" fields of the price response. The
	// captures stop at the closing quote so that any fields following "m" are
	// not swallowed into the value.
	jdPriceRe = regexp.MustCompile(`"p":"([^"]*)","m":"([^"]*)"`)
)

// ParseJD downloads the product page at url and scrapes computer
// specifications out of its HTML.
//
// The url is first passed through safeUrl and then fetched with readContent.
// Data is read from two places on the page: the overview parameter list and
// the detailed specification table. Either one may be missing, but not both.
// When a SKU was found, the prices are then looked up with a second request;
// a failure of that request is not an error and simply leaves the price
// fields at zero.
//
// On success the result is an mgo_models.Computer value (not a pointer)
// carrying a freshly generated ID. An error is returned when readContent
// reports a code other than 200 (presumably the HTTP status), or when neither
// the overview list nor the specification table is present in the page.
func ParseJD(url string) (interface{}, error) {
	url = safeUrl(url)
	content, code := readContent(url)
	if code != 200 {
		return nil, fmt.Errorf("Error:%d\n", code)
	}

	var computer mgo_models.Computer
	computer.ModelUrl = url
	computer.ScanSite = "jd.com"

	// Overview list.
	matchList := jdOverviewListRe.FindStringSubmatch(content)
	if matchList != nil {
		for _, item := range jdOverviewItemRe.FindAllStringSubmatch(matchList[1], -1) {
			// The stored value is the title attribute (item[1]), not the
			// visible text after the colon (item[3]).
			value := item[1]
			switch strings.TrimSpace(item[2]) {
			case "商品名称":
				computer.Name = value
			case "品牌":
				computer.Brand = value
			case "触摸":
				computer.ScreenType = value
			case "电脑类别":
				computer.Type = value
			case "商品编号":
				computer.Code = fmt.Sprintf("%02d%s", SITECODE_JD, value)
				computer.SKU = value
			case "显示器尺寸":
				computer.ScreenSize = value
			case "显卡":
				computer.GraphicsType = value
			}
		}
	}

	// Detailed specification table.
	matchTable := jdSpecTableRe.FindStringSubmatch(content)
	if matchTable != nil {
		// jdSpecSectionRe needs the start of the next heading cell to
		// terminate a section, so append one to close the last section.
		tableContent := matchTable[1] + "<th class=\"tdTitle\" colspan"
		for _, section := range jdSpecSectionRe.FindAllStringSubmatch(tableContent, -1) {
			// section[0] is the whole section markup, section[1] its heading.
			rows := jdSpecRowRe.FindAllStringSubmatch(section[0], -1)
			switch strings.TrimSpace(section[1]) {
			case "操作系统":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "操作系统", "OS":
						computer.OS = row[2]
					}
				}
			case "主体":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "系列":
						computer.Series = row[2]
					case "型号":
						computer.Model = row[2]
					case "平台":
						computer.Platform = row[2]
					case "操作系统":
						computer.OS = row[2]
					}
				}
			case "CPU", "处理器":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "类型", "CPU类型":
						computer.CPUBrand = row[2]
					case "型号", "CPU型号":
						computer.CPUModel = row[2]
					case "速度", "CPU速度":
						computer.CPUFrequency = row[2]
					case "核心数", "核心":
						computer.CPUCoreNum = row[2]
					case "二级缓存":
						computer.CPUSecondCache = row[2]
					case "三级缓存":
						computer.CPUThirdCache = row[2]
					}
				}
			case "显卡":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "品牌", "显卡品牌":
						computer.GraphicsBrand = row[2]
					case "芯片", "显卡芯片", "显示芯片":
						computer.GraphicsModel = row[2]
					case "显卡容量", "显存容量":
						computer.GraphicsMemorySize = row[2]
					case "显存规格":
						computer.GraphicsMemoryType = row[2]
					}
				}
			case "主板":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "芯片组":
						// Recognised but not stored.
					case "显卡类型":
						computer.GraphicsType = row[2]
					case "声卡":
						computer.AudioCard = row[2]
					case "网卡":
						computer.WebCard = row[2]
					}
				}
			case "内存":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "内存容量", "容量":
						computer.MemorySize = row[2]
					case "速度", "内存类型":
						computer.MemoryType = row[2]
					}
				}
			case "硬盘":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "容量", "硬盘容量":
						computer.DiskSize = row[2]
					case "类型", "硬盘类型":
						computer.DiskType = row[2]
					case "转速", "硬盘转速":
						computer.DiskSpeed = row[2]
					}
				}
			case "光驱":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "类型", "光驱类型":
						computer.CDRom = row[2]
					}
				}
			case "输入设备":
				// Recognised but not stored.
			case "规格", "机器规格", "电源":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "电源", "电池":
						computer.Power = row[2]
					case "电源功率":
						// The wattage is appended to whatever power
						// description has been collected so far.
						computer.Power += " " + row[2]
					case "尺寸":
						computer.Size = row[2]
					case "重量", "净重":
						computer.Weight = row[2]
					}
				}
			case "特性":
				for _, row := range rows {
					switch strings.TrimSpace(row[1]) {
					case "特性":
						computer.Feature = row[2]
					}
				}
			}
		}
	}

	if matchList == nil && matchTable == nil {
		return nil, errors.New("无法从该页面获取信息")
	}

	computer.ID = bson.NewObjectId()
	computer.IDStr = computer.ID.Hex()

	// Price lookup. Without a SKU the request could not identify a product,
	// so it is skipped entirely.
	if computer.SKU != "" {
		priceURL := fmt.Sprintf("http://p.3.cn/prices/get?skuid=J_%s", computer.SKU)
		if body, status := readContent(priceURL); status == 200 {
			if m := jdPriceRe.FindStringSubmatch(body); m != nil {
				// A value that does not parse leaves the field at zero
				// rather than storing a bogus number.
				if v, err := strconv.ParseFloat(m[1], 32); err == nil {
					computer.JDPrice = float32(v)
				}
				if v, err := strconv.ParseFloat(m[2], 32); err == nil {
					computer.Price = float32(v)
				}
			}
		}
	}

	return computer, nil
}