maketables.go 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. // Normalization table generator.
  6. // Data read from the web.
  7. // See forminfo.go for a description of the trie values associated with each rune.
  8. package main
  9. import (
  10. "bytes"
  11. "flag"
  12. "fmt"
  13. "io"
  14. "log"
  15. "sort"
  16. "strconv"
  17. "strings"
  18. "golang.org/x/text/internal/gen"
  19. "golang.org/x/text/internal/triegen"
  20. "golang.org/x/text/internal/ucd"
  21. )
  22. func main() {
  23. gen.Init()
  24. loadUnicodeData()
  25. compactCCC()
  26. loadCompositionExclusions()
  27. completeCharFields(FCanonical)
  28. completeCharFields(FCompatibility)
  29. computeNonStarterCounts()
  30. verifyComputed()
  31. printChars()
  32. testDerived()
  33. printTestdata()
  34. makeTables()
  35. }
  36. var (
  37. tablelist = flag.String("tables",
  38. "all",
  39. "comma-separated list of which tables to generate; "+
  40. "can be 'decomp', 'recomp', 'info' and 'all'")
  41. test = flag.Bool("test",
  42. false,
  43. "test existing tables against DerivedNormalizationProps and generate test data for regression testing")
  44. verbose = flag.Bool("verbose",
  45. false,
  46. "write data to stdout as it is parsed")
  47. )
  48. const MaxChar = 0x10FFFF // anything above this shouldn't exist
  49. // Quick Check properties of runes allow us to quickly
  50. // determine whether a rune may occur in a normal form.
  51. // For a given normal form, a rune may be guaranteed to occur
  52. // verbatim (QC=Yes), may or may not combine with another
  53. // rune (QC=Maybe), or may not occur (QC=No).
  54. type QCResult int
  55. const (
  56. QCUnknown QCResult = iota
  57. QCYes
  58. QCNo
  59. QCMaybe
  60. )
  61. func (r QCResult) String() string {
  62. switch r {
  63. case QCYes:
  64. return "Yes"
  65. case QCNo:
  66. return "No"
  67. case QCMaybe:
  68. return "Maybe"
  69. }
  70. return "***UNKNOWN***"
  71. }
  72. const (
  73. FCanonical = iota // NFC or NFD
  74. FCompatibility // NFKC or NFKD
  75. FNumberOfFormTypes
  76. )
  77. const (
  78. MComposed = iota // NFC or NFKC
  79. MDecomposed // NFD or NFKD
  80. MNumberOfModes
  81. )
  82. // This contains only the properties we're interested in.
  83. type Char struct {
  84. name string
  85. codePoint rune // if zero, this index is not a valid code point.
  86. ccc uint8 // canonical combining class
  87. origCCC uint8
  88. excludeInComp bool // from CompositionExclusions.txt
  89. compatDecomp bool // it has a compatibility expansion
  90. nTrailingNonStarters uint8
  91. nLeadingNonStarters uint8 // must be equal to trailing if non-zero
  92. forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
  93. state State
  94. }
  95. var chars = make([]Char, MaxChar+1)
  96. var cccMap = make(map[uint8]uint8)
  97. func (c Char) String() string {
  98. buf := new(bytes.Buffer)
  99. fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name)
  100. fmt.Fprintf(buf, " ccc: %v\n", c.ccc)
  101. fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp)
  102. fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp)
  103. fmt.Fprintf(buf, " state: %v\n", c.state)
  104. fmt.Fprintf(buf, " NFC:\n")
  105. fmt.Fprint(buf, c.forms[FCanonical])
  106. fmt.Fprintf(buf, " NFKC:\n")
  107. fmt.Fprint(buf, c.forms[FCompatibility])
  108. return buf.String()
  109. }
  110. // In UnicodeData.txt, some ranges are marked like this:
  111. // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  112. // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
  113. // parseCharacter keeps a state variable indicating the weirdness.
  114. type State int
  115. const (
  116. SNormal State = iota // known to be zero for the type
  117. SFirst
  118. SLast
  119. SMissing
  120. )
  121. var lastChar = rune('\u0000')
  122. func (c Char) isValid() bool {
  123. return c.codePoint != 0 && c.state != SMissing
  124. }
  125. type FormInfo struct {
  126. quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
  127. verified [MNumberOfModes]bool // index: MComposed or MDecomposed
  128. combinesForward bool // May combine with rune on the right
  129. combinesBackward bool // May combine with rune on the left
  130. isOneWay bool // Never appears in result
  131. inDecomp bool // Some decompositions result in this char.
  132. decomp Decomposition
  133. expandedDecomp Decomposition
  134. }
  135. func (f FormInfo) String() string {
  136. buf := bytes.NewBuffer(make([]byte, 0))
  137. fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed])
  138. fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed])
  139. fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward)
  140. fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward)
  141. fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay)
  142. fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp)
  143. fmt.Fprintf(buf, " decomposition: %X\n", f.decomp)
  144. fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp)
  145. return buf.String()
  146. }
  147. type Decomposition []rune
  148. func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
  149. decomp := strings.Split(s, " ")
  150. if len(decomp) > 0 && skipfirst {
  151. decomp = decomp[1:]
  152. }
  153. for _, d := range decomp {
  154. point, err := strconv.ParseUint(d, 16, 64)
  155. if err != nil {
  156. return a, err
  157. }
  158. a = append(a, rune(point))
  159. }
  160. return a, nil
  161. }
  162. func loadUnicodeData() {
  163. f := gen.OpenUCDFile("UnicodeData.txt")
  164. defer f.Close()
  165. p := ucd.New(f)
  166. for p.Next() {
  167. r := p.Rune(ucd.CodePoint)
  168. char := &chars[r]
  169. char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass))
  170. decmap := p.String(ucd.DecompMapping)
  171. exp, err := parseDecomposition(decmap, false)
  172. isCompat := false
  173. if err != nil {
  174. if len(decmap) > 0 {
  175. exp, err = parseDecomposition(decmap, true)
  176. if err != nil {
  177. log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err)
  178. }
  179. isCompat = true
  180. }
  181. }
  182. char.name = p.String(ucd.Name)
  183. char.codePoint = r
  184. char.forms[FCompatibility].decomp = exp
  185. if !isCompat {
  186. char.forms[FCanonical].decomp = exp
  187. } else {
  188. char.compatDecomp = true
  189. }
  190. if len(decmap) > 0 {
  191. char.forms[FCompatibility].decomp = exp
  192. }
  193. }
  194. if err := p.Err(); err != nil {
  195. log.Fatal(err)
  196. }
  197. }
  198. // compactCCC converts the sparse set of CCC values to a continguous one,
  199. // reducing the number of bits needed from 8 to 6.
  200. func compactCCC() {
  201. m := make(map[uint8]uint8)
  202. for i := range chars {
  203. c := &chars[i]
  204. m[c.ccc] = 0
  205. }
  206. cccs := []int{}
  207. for v, _ := range m {
  208. cccs = append(cccs, int(v))
  209. }
  210. sort.Ints(cccs)
  211. for i, c := range cccs {
  212. cccMap[uint8(i)] = uint8(c)
  213. m[uint8(c)] = uint8(i)
  214. }
  215. for i := range chars {
  216. c := &chars[i]
  217. c.origCCC = c.ccc
  218. c.ccc = m[c.ccc]
  219. }
  220. if len(m) >= 1<<6 {
  221. log.Fatalf("too many difference CCC values: %d >= 64", len(m))
  222. }
  223. }
  224. // CompositionExclusions.txt has form:
  225. // 0958 # ...
  226. // See http://unicode.org/reports/tr44/ for full explanation
  227. func loadCompositionExclusions() {
  228. f := gen.OpenUCDFile("CompositionExclusions.txt")
  229. defer f.Close()
  230. p := ucd.New(f)
  231. for p.Next() {
  232. c := &chars[p.Rune(0)]
  233. if c.excludeInComp {
  234. log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
  235. }
  236. c.excludeInComp = true
  237. }
  238. if e := p.Err(); e != nil {
  239. log.Fatal(e)
  240. }
  241. }
  242. // hasCompatDecomp returns true if any of the recursive
  243. // decompositions contains a compatibility expansion.
  244. // In this case, the character may not occur in NFK*.
  245. func hasCompatDecomp(r rune) bool {
  246. c := &chars[r]
  247. if c.compatDecomp {
  248. return true
  249. }
  250. for _, d := range c.forms[FCompatibility].decomp {
  251. if hasCompatDecomp(d) {
  252. return true
  253. }
  254. }
  255. return false
  256. }
  257. // Hangul related constants.
  258. const (
  259. HangulBase = 0xAC00
  260. HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)
  261. JamoLBase = 0x1100
  262. JamoLEnd = 0x1113
  263. JamoVBase = 0x1161
  264. JamoVEnd = 0x1176
  265. JamoTBase = 0x11A8
  266. JamoTEnd = 0x11C3
  267. JamoLVTCount = 19 * 21 * 28
  268. JamoTCount = 28
  269. )
  270. func isHangul(r rune) bool {
  271. return HangulBase <= r && r < HangulEnd
  272. }
  273. func isHangulWithoutJamoT(r rune) bool {
  274. if !isHangul(r) {
  275. return false
  276. }
  277. r -= HangulBase
  278. return r < JamoLVTCount && r%JamoTCount == 0
  279. }
  280. func ccc(r rune) uint8 {
  281. return chars[r].ccc
  282. }
  283. // Insert a rune in a buffer, ordered by Canonical Combining Class.
  284. func insertOrdered(b Decomposition, r rune) Decomposition {
  285. n := len(b)
  286. b = append(b, 0)
  287. cc := ccc(r)
  288. if cc > 0 {
  289. // Use bubble sort.
  290. for ; n > 0; n-- {
  291. if ccc(b[n-1]) <= cc {
  292. break
  293. }
  294. b[n] = b[n-1]
  295. }
  296. }
  297. b[n] = r
  298. return b
  299. }
  300. // Recursively decompose.
  301. func decomposeRecursive(form int, r rune, d Decomposition) Decomposition {
  302. dcomp := chars[r].forms[form].decomp
  303. if len(dcomp) == 0 {
  304. return insertOrdered(d, r)
  305. }
  306. for _, c := range dcomp {
  307. d = decomposeRecursive(form, c, d)
  308. }
  309. return d
  310. }
  311. func completeCharFields(form int) {
  312. // Phase 0: pre-expand decomposition.
  313. for i := range chars {
  314. f := &chars[i].forms[form]
  315. if len(f.decomp) == 0 {
  316. continue
  317. }
  318. exp := make(Decomposition, 0)
  319. for _, c := range f.decomp {
  320. exp = decomposeRecursive(form, c, exp)
  321. }
  322. f.expandedDecomp = exp
  323. }
  324. // Phase 1: composition exclusion, mark decomposition.
  325. for i := range chars {
  326. c := &chars[i]
  327. f := &c.forms[form]
  328. // Marks script-specific exclusions and version restricted.
  329. f.isOneWay = c.excludeInComp
  330. // Singletons
  331. f.isOneWay = f.isOneWay || len(f.decomp) == 1
  332. // Non-starter decompositions
  333. if len(f.decomp) > 1 {
  334. chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
  335. f.isOneWay = f.isOneWay || chk
  336. }
  337. // Runes that decompose into more than two runes.
  338. f.isOneWay = f.isOneWay || len(f.decomp) > 2
  339. if form == FCompatibility {
  340. f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
  341. }
  342. for _, r := range f.decomp {
  343. chars[r].forms[form].inDecomp = true
  344. }
  345. }
  346. // Phase 2: forward and backward combining.
  347. for i := range chars {
  348. c := &chars[i]
  349. f := &c.forms[form]
  350. if !f.isOneWay && len(f.decomp) == 2 {
  351. f0 := &chars[f.decomp[0]].forms[form]
  352. f1 := &chars[f.decomp[1]].forms[form]
  353. if !f0.isOneWay {
  354. f0.combinesForward = true
  355. }
  356. if !f1.isOneWay {
  357. f1.combinesBackward = true
  358. }
  359. }
  360. if isHangulWithoutJamoT(rune(i)) {
  361. f.combinesForward = true
  362. }
  363. }
  364. // Phase 3: quick check values.
  365. for i := range chars {
  366. c := &chars[i]
  367. f := &c.forms[form]
  368. switch {
  369. case len(f.decomp) > 0:
  370. f.quickCheck[MDecomposed] = QCNo
  371. case isHangul(rune(i)):
  372. f.quickCheck[MDecomposed] = QCNo
  373. default:
  374. f.quickCheck[MDecomposed] = QCYes
  375. }
  376. switch {
  377. case f.isOneWay:
  378. f.quickCheck[MComposed] = QCNo
  379. case (i & 0xffff00) == JamoLBase:
  380. f.quickCheck[MComposed] = QCYes
  381. if JamoLBase <= i && i < JamoLEnd {
  382. f.combinesForward = true
  383. }
  384. if JamoVBase <= i && i < JamoVEnd {
  385. f.quickCheck[MComposed] = QCMaybe
  386. f.combinesBackward = true
  387. f.combinesForward = true
  388. }
  389. if JamoTBase <= i && i < JamoTEnd {
  390. f.quickCheck[MComposed] = QCMaybe
  391. f.combinesBackward = true
  392. }
  393. case !f.combinesBackward:
  394. f.quickCheck[MComposed] = QCYes
  395. default:
  396. f.quickCheck[MComposed] = QCMaybe
  397. }
  398. }
  399. }
  400. func computeNonStarterCounts() {
  401. // Phase 4: leading and trailing non-starter count
  402. for i := range chars {
  403. c := &chars[i]
  404. runes := []rune{rune(i)}
  405. // We always use FCompatibility so that the CGJ insertion points do not
  406. // change for repeated normalizations with different forms.
  407. if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 {
  408. runes = exp
  409. }
  410. // We consider runes that combine backwards to be non-starters for the
  411. // purpose of Stream-Safe Text Processing.
  412. for _, r := range runes {
  413. if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
  414. break
  415. }
  416. c.nLeadingNonStarters++
  417. }
  418. for i := len(runes) - 1; i >= 0; i-- {
  419. if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
  420. break
  421. }
  422. c.nTrailingNonStarters++
  423. }
  424. if c.nTrailingNonStarters > 3 {
  425. log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes)
  426. }
  427. if isHangul(rune(i)) {
  428. c.nTrailingNonStarters = 2
  429. if isHangulWithoutJamoT(rune(i)) {
  430. c.nTrailingNonStarters = 1
  431. }
  432. }
  433. if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t {
  434. log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t)
  435. }
  436. if t := c.nTrailingNonStarters; t > 3 {
  437. log.Fatalf("%U: number of trailing non-starters is %d > 3", t)
  438. }
  439. }
  440. }
  441. func printBytes(w io.Writer, b []byte, name string) {
  442. fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b))
  443. fmt.Fprintf(w, "var %s = [...]byte {", name)
  444. for i, c := range b {
  445. switch {
  446. case i%64 == 0:
  447. fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63)
  448. case i%8 == 0:
  449. fmt.Fprintf(w, "\n")
  450. }
  451. fmt.Fprintf(w, "0x%.2X, ", c)
  452. }
  453. fmt.Fprint(w, "\n}\n\n")
  454. }
  455. // See forminfo.go for format.
  456. func makeEntry(f *FormInfo, c *Char) uint16 {
  457. e := uint16(0)
  458. if r := c.codePoint; HangulBase <= r && r < HangulEnd {
  459. e |= 0x40
  460. }
  461. if f.combinesForward {
  462. e |= 0x20
  463. }
  464. if f.quickCheck[MDecomposed] == QCNo {
  465. e |= 0x4
  466. }
  467. switch f.quickCheck[MComposed] {
  468. case QCYes:
  469. case QCNo:
  470. e |= 0x10
  471. case QCMaybe:
  472. e |= 0x18
  473. default:
  474. log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed])
  475. }
  476. e |= uint16(c.nTrailingNonStarters)
  477. return e
  478. }
  479. // decompSet keeps track of unique decompositions, grouped by whether
  480. // the decomposition is followed by a trailing and/or leading CCC.
  481. type decompSet [7]map[string]bool
  482. const (
  483. normalDecomp = iota
  484. firstMulti
  485. firstCCC
  486. endMulti
  487. firstLeadingCCC
  488. firstCCCZeroExcept
  489. firstStarterWithNLead
  490. lastDecomp
  491. )
  492. var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"}
  493. func makeDecompSet() decompSet {
  494. m := decompSet{}
  495. for i := range m {
  496. m[i] = make(map[string]bool)
  497. }
  498. return m
  499. }
  500. func (m *decompSet) insert(key int, s string) {
  501. m[key][s] = true
  502. }
  503. func printCharInfoTables(w io.Writer) int {
  504. mkstr := func(r rune, f *FormInfo) (int, string) {
  505. d := f.expandedDecomp
  506. s := string([]rune(d))
  507. if max := 1 << 6; len(s) >= max {
  508. const msg = "%U: too many bytes in decomposition: %d >= %d"
  509. log.Fatalf(msg, r, len(s), max)
  510. }
  511. head := uint8(len(s))
  512. if f.quickCheck[MComposed] != QCYes {
  513. head |= 0x40
  514. }
  515. if f.combinesForward {
  516. head |= 0x80
  517. }
  518. s = string([]byte{head}) + s
  519. lccc := ccc(d[0])
  520. tccc := ccc(d[len(d)-1])
  521. cc := ccc(r)
  522. if cc != 0 && lccc == 0 && tccc == 0 {
  523. log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
  524. }
  525. if tccc < lccc && lccc != 0 {
  526. const msg = "%U: lccc (%d) must be <= tcc (%d)"
  527. log.Fatalf(msg, r, lccc, tccc)
  528. }
  529. index := normalDecomp
  530. nTrail := chars[r].nTrailingNonStarters
  531. nLead := chars[r].nLeadingNonStarters
  532. if tccc > 0 || lccc > 0 || nTrail > 0 {
  533. tccc <<= 2
  534. tccc |= nTrail
  535. s += string([]byte{tccc})
  536. index = endMulti
  537. for _, r := range d[1:] {
  538. if ccc(r) == 0 {
  539. index = firstCCC
  540. }
  541. }
  542. if lccc > 0 || nLead > 0 {
  543. s += string([]byte{lccc})
  544. if index == firstCCC {
  545. log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
  546. }
  547. index = firstLeadingCCC
  548. }
  549. if cc != lccc {
  550. if cc != 0 {
  551. log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
  552. }
  553. index = firstCCCZeroExcept
  554. }
  555. } else if len(d) > 1 {
  556. index = firstMulti
  557. }
  558. return index, s
  559. }
  560. decompSet := makeDecompSet()
  561. const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
  562. decompSet.insert(firstStarterWithNLead, nLeadStr)
  563. // Store the uniqued decompositions in a byte buffer,
  564. // preceded by their byte length.
  565. for _, c := range chars {
  566. for _, f := range c.forms {
  567. if len(f.expandedDecomp) == 0 {
  568. continue
  569. }
  570. if f.combinesBackward {
  571. log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
  572. }
  573. index, s := mkstr(c.codePoint, &f)
  574. decompSet.insert(index, s)
  575. }
  576. }
  577. decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
  578. size := 0
  579. positionMap := make(map[string]uint16)
  580. decompositions.WriteString("\000")
  581. fmt.Fprintln(w, "const (")
  582. for i, m := range decompSet {
  583. sa := []string{}
  584. for s := range m {
  585. sa = append(sa, s)
  586. }
  587. sort.Strings(sa)
  588. for _, s := range sa {
  589. p := decompositions.Len()
  590. decompositions.WriteString(s)
  591. positionMap[s] = uint16(p)
  592. }
  593. if cname[i] != "" {
  594. fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
  595. }
  596. }
  597. fmt.Fprintln(w, "maxDecomp = 0x8000")
  598. fmt.Fprintln(w, ")")
  599. b := decompositions.Bytes()
  600. printBytes(w, b, "decomps")
  601. size += len(b)
  602. varnames := []string{"nfc", "nfkc"}
  603. for i := 0; i < FNumberOfFormTypes; i++ {
  604. trie := triegen.NewTrie(varnames[i])
  605. for r, c := range chars {
  606. f := c.forms[i]
  607. d := f.expandedDecomp
  608. if len(d) != 0 {
  609. _, key := mkstr(c.codePoint, &f)
  610. trie.Insert(rune(r), uint64(positionMap[key]))
  611. if c.ccc != ccc(d[0]) {
  612. // We assume the lead ccc of a decomposition !=0 in this case.
  613. if ccc(d[0]) == 0 {
  614. log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
  615. }
  616. }
  617. } else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
  618. // Handle cases where it can't be detected that the nLead should be equal
  619. // to nTrail.
  620. trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
  621. } else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
  622. trie.Insert(c.codePoint, uint64(0x8000|v))
  623. }
  624. }
  625. sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
  626. if err != nil {
  627. log.Fatal(err)
  628. }
  629. size += sz
  630. }
  631. return size
  632. }
  633. func contains(sa []string, s string) bool {
  634. for _, a := range sa {
  635. if a == s {
  636. return true
  637. }
  638. }
  639. return false
  640. }
  641. func makeTables() {
  642. w := &bytes.Buffer{}
  643. size := 0
  644. if *tablelist == "" {
  645. return
  646. }
  647. list := strings.Split(*tablelist, ",")
  648. if *tablelist == "all" {
  649. list = []string{"recomp", "info"}
  650. }
  651. // Compute maximum decomposition size.
  652. max := 0
  653. for _, c := range chars {
  654. if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max {
  655. max = n
  656. }
  657. }
  658. fmt.Fprintln(w, "const (")
  659. fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.")
  660. fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion())
  661. fmt.Fprintln(w)
  662. fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform")
  663. fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at")
  664. fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that")
  665. fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.")
  666. fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max)
  667. fmt.Fprintln(w, ")\n")
  668. // Print the CCC remap table.
  669. size += len(cccMap)
  670. fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap))
  671. for i := 0; i < len(cccMap); i++ {
  672. if i%8 == 0 {
  673. fmt.Fprintln(w)
  674. }
  675. fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)])
  676. }
  677. fmt.Fprintln(w, "\n}\n")
  678. if contains(list, "info") {
  679. size += printCharInfoTables(w)
  680. }
  681. if contains(list, "recomp") {
  682. // Note that we use 32 bit keys, instead of 64 bit.
  683. // This clips the bits of three entries, but we know
  684. // this won't cause a collision. The compiler will catch
  685. // any changes made to UnicodeData.txt that introduces
  686. // a collision.
  687. // Note that the recomposition map for NFC and NFKC
  688. // are identical.
  689. // Recomposition map
  690. nrentries := 0
  691. for _, c := range chars {
  692. f := c.forms[FCanonical]
  693. if !f.isOneWay && len(f.decomp) > 0 {
  694. nrentries++
  695. }
  696. }
  697. sz := nrentries * 8
  698. size += sz
  699. fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz)
  700. fmt.Fprintln(w, "var recompMap = map[uint32]rune{")
  701. for i, c := range chars {
  702. f := c.forms[FCanonical]
  703. d := f.decomp
  704. if !f.isOneWay && len(d) > 0 {
  705. key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1]))
  706. fmt.Fprintf(w, "0x%.8X: 0x%.4X,\n", key, i)
  707. }
  708. }
  709. fmt.Fprintf(w, "}\n\n")
  710. }
  711. fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
  712. gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes())
  713. }
  714. func printChars() {
  715. if *verbose {
  716. for _, c := range chars {
  717. if !c.isValid() || c.state == SMissing {
  718. continue
  719. }
  720. fmt.Println(c)
  721. }
  722. }
  723. }
  724. // verifyComputed does various consistency tests.
  725. func verifyComputed() {
  726. for i, c := range chars {
  727. for _, f := range c.forms {
  728. isNo := (f.quickCheck[MDecomposed] == QCNo)
  729. if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) {
  730. log.Fatalf("%U: NF*D QC must be No if rune decomposes", i)
  731. }
  732. isMaybe := f.quickCheck[MComposed] == QCMaybe
  733. if f.combinesBackward != isMaybe {
  734. log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i)
  735. }
  736. if len(f.decomp) > 0 && f.combinesForward && isMaybe {
  737. log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i)
  738. }
  739. if len(f.expandedDecomp) != 0 {
  740. continue
  741. }
  742. if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b {
  743. // We accept these runes to be treated differently (it only affects
  744. // segment breaking in iteration, most likely on improper use), but
  745. // reconsider if more characters are added.
  746. // U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
  747. // U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
  748. // U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
  749. // U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
  750. // U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
  751. // U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
  752. if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) {
  753. log.Fatalf("%U: nLead was %v; want %v", i, a, b)
  754. }
  755. }
  756. }
  757. nfc := c.forms[FCanonical]
  758. nfkc := c.forms[FCompatibility]
  759. if nfc.combinesBackward != nfkc.combinesBackward {
  760. log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
  761. }
  762. }
  763. }
  764. // Use values in DerivedNormalizationProps.txt to compare against the
  765. // values we computed.
  766. // DerivedNormalizationProps.txt has form:
  767. // 00C0..00C5 ; NFD_QC; N # ...
  768. // 0374 ; NFD_QC; N # ...
  769. // See http://unicode.org/reports/tr44/ for full explanation
  770. func testDerived() {
  771. f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
  772. defer f.Close()
  773. p := ucd.New(f)
  774. for p.Next() {
  775. r := p.Rune(0)
  776. c := &chars[r]
  777. var ftype, mode int
  778. qt := p.String(1)
  779. switch qt {
  780. case "NFC_QC":
  781. ftype, mode = FCanonical, MComposed
  782. case "NFD_QC":
  783. ftype, mode = FCanonical, MDecomposed
  784. case "NFKC_QC":
  785. ftype, mode = FCompatibility, MComposed
  786. case "NFKD_QC":
  787. ftype, mode = FCompatibility, MDecomposed
  788. default:
  789. continue
  790. }
  791. var qr QCResult
  792. switch p.String(2) {
  793. case "Y":
  794. qr = QCYes
  795. case "N":
  796. qr = QCNo
  797. case "M":
  798. qr = QCMaybe
  799. default:
  800. log.Fatalf(`Unexpected quick check value "%s"`, p.String(2))
  801. }
  802. if got := c.forms[ftype].quickCheck[mode]; got != qr {
  803. log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr)
  804. }
  805. c.forms[ftype].verified[mode] = true
  806. }
  807. if err := p.Err(); err != nil {
  808. log.Fatal(err)
  809. }
  810. // Any unspecified value must be QCYes. Verify this.
  811. for i, c := range chars {
  812. for j, fd := range c.forms {
  813. for k, qr := range fd.quickCheck {
  814. if !fd.verified[k] && qr != QCYes {
  815. m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
  816. log.Printf(m, i, j, k, qr, c.name)
  817. }
  818. }
  819. }
  820. }
  821. }
  822. var testHeader = `const (
  823. Yes = iota
  824. No
  825. Maybe
  826. )
  827. type formData struct {
  828. qc uint8
  829. combinesForward bool
  830. decomposition string
  831. }
  832. type runeData struct {
  833. r rune
  834. ccc uint8
  835. nLead uint8
  836. nTrail uint8
  837. f [2]formData // 0: canonical; 1: compatibility
  838. }
  839. func f(qc uint8, cf bool, dec string) [2]formData {
  840. return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
  841. }
  842. func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
  843. return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
  844. }
  845. var testData = []runeData{
  846. `
  847. func printTestdata() {
  848. type lastInfo struct {
  849. ccc uint8
  850. nLead uint8
  851. nTrail uint8
  852. f string
  853. }
  854. last := lastInfo{}
  855. w := &bytes.Buffer{}
  856. fmt.Fprintf(w, testHeader)
  857. for r, c := range chars {
  858. f := c.forms[FCanonical]
  859. qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
  860. f = c.forms[FCompatibility]
  861. qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
  862. s := ""
  863. if d == dk && qc == qck && cf == cfk {
  864. s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d)
  865. } else {
  866. s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk)
  867. }
  868. current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s}
  869. if last != current {
  870. fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s)
  871. last = current
  872. }
  873. }
  874. fmt.Fprintln(w, "}")
  875. gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes())
  876. }