iter.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package norm
  5. import (
  6. "fmt"
  7. "unicode/utf8"
  8. )
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It exports the package-internal maxByteBufferSize, which is also the size
// of the scratch buffer embedded in every Iter.
const MaxSegmentSize = maxByteBufferSize
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// Iteration is driven by a small state machine: next holds the function that
// produces the following segment and is swapped as the input changes
// character (ASCII run, Hangul run, multi-segment decomposition, done, ...).
type Iter struct {
	rb       reorderBuffer           // reorder buffer; also carries the input (rb.src), its length (rb.nsrc) and the form (rb.f)
	buf      [maxByteBufferSize]byte // scratch space backing segments that cannot be returned as a slice of the input
	info     Properties              // first character saved from previous iteration
	next     iterFunc                // implementation of next depends on form
	asciiF   iterFunc                // ASCII fast path for the input type: nextASCIIBytes or nextASCIIString
	p        int                     // current position in input source
	multiSeg []byte                  // remainder of multi-segment decomposition
}
// iterFunc is the signature of the state functions stored in Iter.next and
// Iter.asciiF. Each call returns one segment and may replace i.next with the
// function that should handle the following segment.
type iterFunc func(*Iter) []byte
  24. // Init initializes i to iterate over src after normalizing it to Form f.
  25. func (i *Iter) Init(f Form, src []byte) {
  26. i.p = 0
  27. if len(src) == 0 {
  28. i.setDone()
  29. i.rb.nsrc = 0
  30. return
  31. }
  32. i.multiSeg = nil
  33. i.rb.init(f, src)
  34. i.next = i.rb.f.nextMain
  35. i.asciiF = nextASCIIBytes
  36. i.info = i.rb.f.info(i.rb.src, i.p)
  37. }
  38. // InitString initializes i to iterate over src after normalizing it to Form f.
  39. func (i *Iter) InitString(f Form, src string) {
  40. i.p = 0
  41. if len(src) == 0 {
  42. i.setDone()
  43. i.rb.nsrc = 0
  44. return
  45. }
  46. i.multiSeg = nil
  47. i.rb.initString(f, src)
  48. i.next = i.rb.f.nextMain
  49. i.asciiF = nextASCIIString
  50. i.info = i.rb.f.info(i.rb.src, i.p)
  51. }
  52. // Seek sets the segment to be returned by the next call to Next to start
  53. // at position p. It is the responsibility of the caller to set p to the
  54. // start of a UTF8 rune.
  55. func (i *Iter) Seek(offset int64, whence int) (int64, error) {
  56. var abs int64
  57. switch whence {
  58. case 0:
  59. abs = offset
  60. case 1:
  61. abs = int64(i.p) + offset
  62. case 2:
  63. abs = int64(i.rb.nsrc) + offset
  64. default:
  65. return 0, fmt.Errorf("norm: invalid whence")
  66. }
  67. if abs < 0 {
  68. return 0, fmt.Errorf("norm: negative position")
  69. }
  70. if int(abs) >= i.rb.nsrc {
  71. i.setDone()
  72. return int64(i.p), nil
  73. }
  74. i.p = int(abs)
  75. i.multiSeg = nil
  76. i.next = i.rb.f.nextMain
  77. i.info = i.rb.f.info(i.rb.src, i.p)
  78. return abs, nil
  79. }
  80. // returnSlice returns a slice of the underlying input type as a byte slice.
  81. // If the underlying is of type []byte, it will simply return a slice.
  82. // If the underlying is of type string, it will copy the slice to the buffer
  83. // and return that.
  84. func (i *Iter) returnSlice(a, b int) []byte {
  85. if i.rb.src.bytes == nil {
  86. return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
  87. }
  88. return i.rb.src.bytes[a:b]
  89. }
// Pos returns the byte position at which the next call to Next will commence processing.
// Once the iterator is done this is the length of the input.
func (i *Iter) Pos() int {
	return i.p
}
  94. func (i *Iter) setDone() {
  95. i.next = nextDone
  96. i.p = i.rb.nsrc
  97. }
  98. // Done returns true if there is no more input to process.
  99. func (i *Iter) Done() bool {
  100. return i.p >= i.rb.nsrc
  101. }
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
// For any input a and b for which f(a) == f(b), subsequent calls
// to Next will return the same segments.
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
// Although not guaranteed, n will typically be the smallest possible n.
//
// The returned slice may alias either the input or the iterator's internal
// buffer, which later calls overwrite. Once the iterator is done, Next
// returns nil.
func (i *Iter) Next() []byte {
	// Dispatch to the current state function; it advances i.p and may
	// install a different function for the following call.
	return i.next(i)
}
  110. func nextASCIIBytes(i *Iter) []byte {
  111. p := i.p + 1
  112. if p >= i.rb.nsrc {
  113. i.setDone()
  114. return i.rb.src.bytes[i.p:p]
  115. }
  116. if i.rb.src.bytes[p] < utf8.RuneSelf {
  117. p0 := i.p
  118. i.p = p
  119. return i.rb.src.bytes[p0:p]
  120. }
  121. i.info = i.rb.f.info(i.rb.src, i.p)
  122. i.next = i.rb.f.nextMain
  123. return i.next(i)
  124. }
  125. func nextASCIIString(i *Iter) []byte {
  126. p := i.p + 1
  127. if p >= i.rb.nsrc {
  128. i.buf[0] = i.rb.src.str[i.p]
  129. i.setDone()
  130. return i.buf[:1]
  131. }
  132. if i.rb.src.str[p] < utf8.RuneSelf {
  133. i.buf[0] = i.rb.src.str[i.p]
  134. i.p = p
  135. return i.buf[:1]
  136. }
  137. i.info = i.rb.f.info(i.rb.src, i.p)
  138. i.next = i.rb.f.nextMain
  139. return i.next(i)
  140. }
  141. func nextHangul(i *Iter) []byte {
  142. p := i.p
  143. next := p + hangulUTF8Size
  144. if next >= i.rb.nsrc {
  145. i.setDone()
  146. } else if i.rb.src.hangul(next) == 0 {
  147. i.info = i.rb.f.info(i.rb.src, i.p)
  148. i.next = i.rb.f.nextMain
  149. return i.next(i)
  150. }
  151. i.p = next
  152. return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
  153. }
  154. func nextDone(i *Iter) []byte {
  155. return nil
  156. }
  157. // nextMulti is used for iterating over multi-segment decompositions
  158. // for decomposing normal forms.
  159. func nextMulti(i *Iter) []byte {
  160. j := 0
  161. d := i.multiSeg
  162. // skip first rune
  163. for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
  164. }
  165. for j < len(d) {
  166. info := i.rb.f.info(input{bytes: d}, j)
  167. if info.BoundaryBefore() {
  168. i.multiSeg = d[j:]
  169. return d[:j]
  170. }
  171. j += int(info.size)
  172. }
  173. // treat last segment as normal decomposition
  174. i.next = i.rb.f.nextMain
  175. return i.next(i)
  176. }
  177. // nextMultiNorm is used for iterating over multi-segment decompositions
  178. // for composing normal forms.
  179. func nextMultiNorm(i *Iter) []byte {
  180. j := 0
  181. d := i.multiSeg
  182. for j < len(d) {
  183. info := i.rb.f.info(input{bytes: d}, j)
  184. if info.BoundaryBefore() {
  185. i.rb.compose()
  186. seg := i.buf[:i.rb.flushCopy(i.buf[:])]
  187. i.rb.ss.first(info)
  188. i.rb.insertUnsafe(input{bytes: d}, j, info)
  189. i.multiSeg = d[j+int(info.size):]
  190. return seg
  191. }
  192. i.rb.ss.next(info)
  193. i.rb.insertUnsafe(input{bytes: d}, j, info)
  194. j += int(info.size)
  195. }
  196. i.multiSeg = nil
  197. i.next = nextComposed
  198. return doNormComposed(i)
  199. }
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It scans forward from i.p and builds a single segment. Input bytes that
// need no change are tracked as the pending range [inCopyStart, i.p) and are
// only copied into i.buf (at outCopyStart) once something else has to be
// written next to them; decompositions and Hangul expansions are written to
// i.buf directly. If a rune's ccc turns out to be lower than the tccc of the
// rune before it, the fast path is abandoned and the work done so far is
// handed to the reorder buffer (see the doNorm label).
func nextDecomposed(i *Iter) (next []byte) {
	// outp is the length of the segment produced so far.
	outp := 0
	inCopyStart, outCopyStart := i.p, 0
	ss := mkStreamSafe(i.info)
	for {
		if sz := int(i.info.size); sz <= 1 {
			p := i.p
			i.p++ // ASCII or illegal byte. Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Followed by another single ASCII byte: switch to the
				// ASCII fast path for the following calls.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			// Followed by a non-ASCII byte: keep the byte in the pending
			// range and continue with the common tail of the loop.
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy. In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				if p > len(i.buf) {
					// No room for the decomposition: return what we have;
					// i.p still points at the undecomposed rune.
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment. Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch ss.next(i.info) {
			case ssOverflow:
				// The segment ends here as well; the following call starts
				// through nextCGJDecompose.
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				// Nothing precedes d in this segment: return it as is.
				return d
			}
			// The segment continues: append d and restart the pending range
			// right after the rune that was just decomposed.
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			// NOTE(review): outp is overwritten rather than advanced here,
			// which looks like it relies on a Hangul syllable always being
			// the first rune of a segment - confirm.
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// Another Hangul syllable follows: use the dedicated
				// implementation for the following calls.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// Rune without decomposition: it extends the pending range,
			// unless the segment would no longer fit in i.buf.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// The output copy offset never moved, so the whole segment is the
		// input range [inCopyStart, i.p) and can be returned via returnSlice.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		// Append the pending input bytes behind what is already in i.buf.
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
  314. func doNormDecomposed(i *Iter) []byte {
  315. for {
  316. if s := i.rb.ss.next(i.info); s == ssOverflow {
  317. i.next = nextCGJDecompose
  318. break
  319. }
  320. i.rb.insertUnsafe(i.rb.src, i.p, i.info)
  321. if i.p += int(i.info.size); i.p >= i.rb.nsrc {
  322. i.setDone()
  323. break
  324. }
  325. i.info = i.rb.f.info(i.rb.src, i.p)
  326. if i.info.ccc == 0 {
  327. break
  328. }
  329. }
  330. // new segment or too many combining characters: exit normalization
  331. return i.buf[:i.rb.flushCopy(i.buf[:])]
  332. }
  333. func nextCGJDecompose(i *Iter) []byte {
  334. i.rb.ss = 0
  335. i.rb.insertCGJ()
  336. i.next = nextDecomposed
  337. buf := doNormDecomposed(i)
  338. return buf
  339. }
  340. // nextComposed is the implementation of Next for forms NFC and NFKC.
  341. func nextComposed(i *Iter) []byte {
  342. outp, startp := 0, i.p
  343. var prevCC uint8
  344. ss := mkStreamSafe(i.info)
  345. for {
  346. if !i.info.isYesC() {
  347. goto doNorm
  348. }
  349. prevCC = i.info.tccc
  350. sz := int(i.info.size)
  351. if sz == 0 {
  352. sz = 1 // illegal rune: copy byte-by-byte
  353. }
  354. p := outp + sz
  355. if p > len(i.buf) {
  356. break
  357. }
  358. outp = p
  359. i.p += sz
  360. if i.p >= i.rb.nsrc {
  361. i.setDone()
  362. break
  363. } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
  364. i.next = i.asciiF
  365. break
  366. }
  367. i.info = i.rb.f.info(i.rb.src, i.p)
  368. if v := ss.next(i.info); v == ssStarter {
  369. break
  370. } else if v == ssOverflow {
  371. i.next = nextCGJCompose
  372. break
  373. }
  374. if i.info.ccc < prevCC {
  375. goto doNorm
  376. }
  377. }
  378. return i.returnSlice(startp, i.p)
  379. doNorm:
  380. i.p = startp
  381. i.info = i.rb.f.info(i.rb.src, i.p)
  382. if i.info.multiSegment() {
  383. d := i.info.Decomposition()
  384. info := i.rb.f.info(input{bytes: d}, 0)
  385. i.rb.insertUnsafe(input{bytes: d}, 0, info)
  386. i.multiSeg = d[int(info.size):]
  387. i.next = nextMultiNorm
  388. return nextMultiNorm(i)
  389. }
  390. i.rb.ss.first(i.info)
  391. i.rb.insertUnsafe(i.rb.src, i.p, i.info)
  392. return doNormComposed(i)
  393. }
  394. func doNormComposed(i *Iter) []byte {
  395. // First rune should already be inserted.
  396. for {
  397. if i.p += int(i.info.size); i.p >= i.rb.nsrc {
  398. i.setDone()
  399. break
  400. }
  401. i.info = i.rb.f.info(i.rb.src, i.p)
  402. if s := i.rb.ss.next(i.info); s == ssStarter {
  403. break
  404. } else if s == ssOverflow {
  405. i.next = nextCGJCompose
  406. break
  407. }
  408. i.rb.insertUnsafe(i.rb.src, i.p, i.info)
  409. }
  410. i.rb.compose()
  411. seg := i.buf[:i.rb.flushCopy(i.buf[:])]
  412. return seg
  413. }
// nextCGJCompose is the implementation of Next installed after a stream-safe
// overflow in a composing form. It resets the stream-safe state, inserts a
// CGJ into the reorder buffer, adds the rune described by i.info behind it
// and lets doNormComposed finish the segment. nextComposed is reinstated for
// the following call unless doNormComposed installs something else.
func nextCGJCompose(i *Iter) []byte {
	i.rb.ss = 0 // instead of first
	i.rb.insertCGJ()
	i.next = nextComposed
	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
	// even if they are not. This is particularly dubious for U+FF9E and U+FF9A.
	// If we ever change that, insert a check here.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}