func parsePage(num int, p pdf.Page) (name string, table []Inst) { content := p.Content() var text []pdf.Text for _, t := range content.Text { if match(t, "Times-Roman", 7.2, "") { t.FontSize = 9 } if match(t, "Times-Roman", 6.72, "") && '0' <= t.S[0] && t.S[0] <= '9' { t.S = string([]rune("⁰¹²³⁴⁵⁶⁷⁸⁹")[t.S[0]-'0']) t.FontSize = 9 t.Y -= 2.28 } if t.Font == "Gen_Arial" { continue } text = append(text, t) } text = findWords(text) for i, t := range text { if t.Font == "Times" { t.Font = "Times-Roman" text[i] = t } } if debugPage > 0 { for _, t := range text { fmt.Println(t) } for _, r := range content.Rect { fmt.Println(r) } } // Remove text we should ignore. out := text[:0] skip := false for _, t := range text { // skip page footer if match(t, "Helvetica", 8, "A") || match(t, "Helvetica", 8, "ARM DDI") || match(t, "Helvetica-Oblique", 8, "Copyright") { continue } // skip section header and body text if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) || t.S == "Alphabetical list of instructions") { skip = true continue } if skip && match(t, "Times-Roman", 9, "") { continue } skip = false out = append(out, t) } text = out // Page header must say Instruction Details. if len(text) == 0 || !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") { return "", nil } text = text[1:] isSection := func(text []pdf.Text, i int) int { if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") { return 2 } if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) { return 1 } return 0 } // Skip dummy headlines and sections. for d := isSection(text, 0); d != 0; d = isSection(text, 0) { i := d for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") { i++ } if isSection(text, i) == 0 { break } text = text[i:] } // Next line is headline. Can wrap to multiple lines. d := isSection(text, 0) if d == 0 { if debugPage > 0 { fmt.Printf("non-inst-headline: %v\n", text[0]) } checkNoEncodings(num, text) return "", nil } if d == 2 { name = text[1].S text = text[2:] } else if d == 1 { m := childRE.FindStringSubmatch(text[0].S) name = m[1] text = text[1:] } for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") { name += " " + text[0].S text = text[1:] } // Skip description. for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") || match(text[0], "LucidaSansTypewriteX", 6.48, "") || match(text[0], "Times-Bold", 10, "Note")) { text = text[1:] } // Encodings follow. warned := false for i := 0; i < len(text); { if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") || match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") || match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") || match(text[i], "Helvetica-Bold", 9, "Related encodings") || match(text[i], "Times-Roman", 9, "Figure A") || match(text[i], "Helvetica-Bold", 9, "Table A") || match(text[i], "Helvetica-Bold", 9, "VFP Instructions") || match(text[i], "Helvetica-Bold", 9, "VFP instructions") || match(text[i], "Helvetica-Bold", 9, "VFP vectors") || match(text[i], "Helvetica-Bold", 9, "FLDMX") || match(text[i], "Helvetica-Bold", 9, "FSTMX") || match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") { checkNoEncodings(num, text[i:]) break } if match(text[i], "Helvetica-Bold", 9, "Figure A") { y := text[i].Y i++ for i < len(text) && math.Abs(text[i].Y-y) < 2 { i++ } continue } if !match(text[i], "Helvetica-Bold", 9, "Encoding") { if !warned { warned = true fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i]) } i++ continue } inst := Inst{ Name: name, } enc := text[i].S x := text[i].X i++ // Possible subarchitecture notes. for i < len(text) && text[i].X > x+36 { if inst.Arch != "" { inst.Arch += " " } inst.Arch += text[i].S i++ } // Encoding syntaxes. for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") || text[i].X > x+36) { if text[i].X < x+0.25*inch { inst.Syntax = append(inst.Syntax, text[i].S) } else { s := inst.Syntax[len(inst.Syntax)-1] if !strings.Contains(s, "\t") { s += "\t" } else { s += " " } s += text[i].S inst.Syntax[len(inst.Syntax)-1] = s } i++ } var bits, abits, aenc string bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i) if strings.Contains(enc, " / ") { if i < len(text) && match(text[i], "Times-Roman", 8, "") { abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i) } else { abits = bits } slash := strings.Index(enc, " / ") aenc = "Encoding " + enc[slash+len(" / "):] enc = enc[:slash] } // pseudocode y0 := -1 * inch tab := 0.0 for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") { t := text[i] i++ if math.Abs(t.Y-y0) < 3 { // same line as last fragment, probably just two spaces inst.Code += " " + t.S continue } if inst.Code != "" { inst.Code += "\n" } if t.X > x+0.1*inch { if tab == 0 { tab = t.X - x } inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5)) } else { tab = 0 } inst.Code += t.S y0 = t.Y } inst.ID = strings.TrimPrefix(enc, "Encoding ") inst.Bits = bits table = append(table, inst) if abits != "" { inst.ID = strings.TrimPrefix(aenc, "Encoding ") inst.Bits = abits table = append(table, inst) } } return name, table }
func parsePage(num int, p pdf.Page) []Inst { content := p.Content() var text []pdf.Text for _, t := range content.Text { text = append(text, t) } text = findWords(text) if debugPage > 0 { for _, t := range text { fmt.Println(t) } for _, r := range content.Rect { fmt.Println(r) } } // Look for instruction encodings. // Some begin with a Helvetica-BoldOblique size 11 headline like "AND X-Form", // is followed by Helvetica 9 mnemonic, and then a bit box with // Helvetica 9 fields and Helvetica 7 bit offsets. // Others use Arial,BoldItalic 11 for the headline, // Arial 8 for the mnemonic, and Arial 4.2 for the bit offsets. var insts []Inst for { // Heading for len(text) > 0 && !match(text[0], "Helvetica-BoldOblique", 11, "") && !match(text[0], "Arial,BoldItalic", 11, "") && !match(text[0], "Arial,BoldItalic", 10, "") { text = text[1:] } if len(text) == 0 { break } heading := text[0].S text = text[1:] for len(text) > 0 && (match(text[0], "Helvetica-BoldOblique", 11, "") || match(text[0], "Arial,BoldItalic", 11, "") || match(text[0], "Arial,BoldItalic", 10, "")) { heading += " " + text[0].S text = text[1:] } heading = strings.Replace(heading, "]", "] ", -1) heading = strings.Replace(heading, " ", " ", -1) heading = strings.Replace(heading, "rEVX-form", "r EVX-form", -1) heading = strings.Replace(heading, "eX-form", "e X-form", -1) heading = strings.Replace(heading, "mSD4-form", "m SD4-form", -1) heading = strings.Replace(heading, "eSCI8-form", "e SCI8-form", -1) heading = strings.TrimSpace(heading) if isVLE(heading) { continue } // Mnemonic if len(text) == 0 || (!match(text[0], "Helvetica", 9, "") && !match(text[0], "Helvetica-BoldOblique", 9, "") && !match(text[0], "Arial", 9, "") && !match(text[0], "Arial", 10, "")) { continue } mnemonic := "" y := text[0].Y x0 := text[0].X for len(text) > 0 && (match(text[0], "Helvetica", 9, "") || match(text[0], "Helvetica-BoldOblique", 9, "") || match(text[0], "Arial", 9, "") || match(text[0], "Courier", 8, "") || match(text[0], "LucidaConsole", 7.17, "") || text[0].Y == y) { if text[0].Y != y { if math.Abs(text[0].X-x0) > 4 { break } mnemonic += "\n" y = text[0].Y } else if mnemonic != "" { mnemonic += " " } mnemonic += text[0].S text = text[1:] } // Encoding bits, i := readBitBox(heading, content, text, num) if i == 0 { continue } insts = append(insts, Inst{heading, mnemonic, bits}) } return insts }