123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120 |
- // Copyright 2015 Huan Du. All rights reserved.
- // Licensed under the MIT license that can be found in the LICENSE file.
- package xstrings
- import (
- "unicode"
- "unicode/utf8"
- )
// Len reports the number of runes in str, as opposed to the built-in len,
// which reports the number of bytes.
//
// Counting is delegated to utf8.RuneCountInString, so every byte that is not
// part of a valid UTF-8 sequence is counted as one rune.
func Len(str string) int {
	runes := utf8.RuneCountInString(str)
	return runes
}
- // WordCount returns number of words in a string.
- //
- // Word is defined as a locale dependent string containing alphabetic characters,
- // which may also contain but not start with `'` and `-` characters.
- func WordCount(str string) int {
- var r rune
- var size, n int
- inWord := false
- for len(str) > 0 {
- r, size = utf8.DecodeRuneInString(str)
- switch {
- case isAlphabet(r):
- if !inWord {
- inWord = true
- n++
- }
- case inWord && (r == '\'' || r == '-'):
- // Still in word.
- default:
- inWord = false
- }
- str = str[size:]
- }
- return n
- }
// minCJKCharacter is the lowest code point that isAlphabet checks against the
// Han script table. U+3400 is the first code point of the CJK Unified
// Ideographs Extension A block; every letter below it is accepted as
// alphabetic without further checks.
const minCJKCharacter = '\u3400'

// isAlphabet reports whether r is a letter that is not a CJK (Han) ideograph.
//
// Non-letters are always rejected. Letters below minCJKCharacter are always
// accepted. Letters at or above minCJKCharacter are accepted unless they
// belong to the Han script according to the standard unicode tables, which
// covers every ideograph block known to the Go release in use rather than a
// hand-maintained list of code point ranges.
func isAlphabet(r rune) bool {
	if !unicode.IsLetter(r) {
		return false
	}

	// Quick check for non-CJK character.
	if r < minCJKCharacter {
		return true
	}

	return !unicode.Is(unicode.Han, r)
}
- // Width returns string width in monotype font.
- // Multi-byte characters are usually twice the width of single byte characters.
- //
- // Algorithm comes from `mb_strwidth` in PHP.
- // http://php.net/manual/en/function.mb-strwidth.php
- func Width(str string) int {
- var r rune
- var size, n int
- for len(str) > 0 {
- r, size = utf8.DecodeRuneInString(str)
- n += RuneWidth(r)
- str = str[size:]
- }
- return n
- }
// RuneWidth returns the display width of a single rune in a monospaced font.
// Multi-byte characters are usually twice the width of single byte characters.
//
// The result is:
//
//	0 for utf8.RuneError and for every rune below U+0020
//	1 for U+0020 through U+1FFF
//	2 for U+2000 through U+FF60
//	1 for U+FF61 through U+FF9F
//	2 for U+FFA0 and above
//
// The thresholds follow `mb_strwidth` in PHP.
// http://php.net/manual/en/function.mb-strwidth.php
func RuneWidth(r rune) int {
	// The decoding-error marker and control characters take no space. This
	// check must come first because utf8.RuneError lies above U+FFA0.
	if r == utf8.RuneError || r < '\x20' {
		return 0
	}

	// From here on each guard only needs the upper bound of its band, since
	// the lower bound was ruled out by the guard before it.
	if r < '\u2000' {
		return 1
	}
	if r < '\uFF61' {
		return 2
	}
	if r < '\uFFA0' {
		return 1
	}

	return 2
}
|