This module provides support to handle the Unicode UTF-8 encoding.
There are no specialized insert, delete, add and contains procedures for seq[Rune] in this module because the generic variants of these procedures in the system module already work with it.
The current version is compatible with Unicode v12.0.0.
See also:
proc align(s: openArray[char]; count: Natural; padding = ' '.Rune): string {.
noSideEffect, ...gcsafe, extern: "nucAlignString", raises: [], tags: [],
forbids: [].}Aligns a unicode string s with padding, so that it has a rune-length of count.
padding characters (by default spaces) are added before s resulting in right alignment. If s.runelen >= count, no spaces are added and s is returned unchanged. If you need to left align a string use the alignLeft proc.
Example:
assert align("abc", 4) == " abc"
assert align("a", 0) == "a"
assert align("1232", 6) == "  1232"
assert align("1232", 6, '#'.Rune) == "##1232"
assert align("Åge", 5) == "  Åge"
assert align("×", 4, '_'.Rune) == "___×" Source Edit proc align(s: string; count: Natural; padding = ' '.Rune): string {.
noSideEffect, inline, ...raises: [], tags: [], forbids: [].}Aligns a unicode string s with padding, so that it has a rune-length of count.
padding characters (by default spaces) are added before s resulting in right alignment. If s.runelen >= count, no spaces are added and s is returned unchanged. If you need to left align a string use the alignLeft proc.
Example:
assert align("abc", 4) == " abc"
assert align("a", 0) == "a"
assert align("1232", 6) == "  1232"
assert align("1232", 6, '#'.Rune) == "##1232"
assert align("Åge", 5) == "  Åge"
assert align("×", 4, '_'.Rune) == "___×" Source Edit proc alignLeft(s: openArray[char]; count: Natural; padding = ' '.Rune): string {.
noSideEffect, ...raises: [], tags: [], forbids: [].}Left-aligns a unicode string s with padding, so that it has a rune-length of count.
padding characters (by default spaces) are added after s resulting in left alignment. If s.runelen >= count, no spaces are added and s is returned unchanged. If you need to right align a string use the align proc.
Example:
assert alignLeft("abc", 4) == "abc "
assert alignLeft("a", 0) == "a"
assert alignLeft("1232", 6) == "1232  "
assert alignLeft("1232", 6, '#'.Rune) == "1232##"
assert alignLeft("Åge", 5) == "Åge  "
assert alignLeft("×", 4, '_'.Rune) == "×___" Source Edit proc alignLeft(s: string; count: Natural; padding = ' '.Rune): string {.
noSideEffect, inline, ...raises: [], tags: [], forbids: [].}Left-aligns a unicode string s with padding, so that it has a rune-length of count.
padding characters (by default spaces) are added after s resulting in left alignment. If s.runelen >= count, no spaces are added and s is returned unchanged. If you need to right align a string use the align proc.
Example:
assert alignLeft("abc", 4) == "abc "
assert alignLeft("a", 0) == "a"
assert alignLeft("1232", 6) == "1232  "
assert alignLeft("1232", 6, '#'.Rune) == "1232##"
assert alignLeft("Åge", 5) == "Åge  "
assert alignLeft("×", 4, '_'.Rune) == "×___" Source Edit proc graphemeLen(s: openArray[char]; i: Natural): Natural {....raises: [],
tags: [], forbids: [].}
The number of bytes belonging to byte index s[i], including following combining code units.
Example:
let a = "añyóng"
doAssert a.graphemeLen(1) == 2 ## ñ
doAssert a.graphemeLen(2) == 1
doAssert a.graphemeLen(4) == 2 ## ó
Source Edit
proc graphemeLen(s: string; i: Natural): Natural {.inline, ...raises: [], tags: [],
forbids: [].}
The number of bytes belonging to byte index s[i], including following combining code units.
Example:
let a = "añyóng"
doAssert a.graphemeLen(1) == 2 ## ñ
doAssert a.graphemeLen(2) == 1
doAssert a.graphemeLen(4) == 2 ## ó
Source Edit
proc isAlpha(c: Rune): bool {....gcsafe, extern: "nuc$1", raises: [], tags: [],
forbids: [].}Returns true if c is an alpha rune (i.e., a letter).
See also:
Source Editproc reversed(s: openArray[char]): string {....raises: [], tags: [], forbids: [].}Returns the reverse of s, interpreting it as runes.
Unicode combining characters are correctly interpreted as well.
Example:
assert reversed("Reverse this!") == "!siht esreveR"
assert reversed("先秦兩漢") == "漢兩秦先"
assert reversed("as⃝df̅") == "f̅ds⃝a"
assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞" Source Edit proc reversed(s: string): string {.inline, ...raises: [], tags: [], forbids: [].}Returns the reverse of s, interpreting it as runes.
Unicode combining characters are correctly interpreted as well.
Example:
assert reversed("Reverse this!") == "!siht esreveR"
assert reversed("先秦兩漢") == "漢兩秦先"
assert reversed("as⃝df̅") == "f̅ds⃝a"
assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞" Source Edit proc runeAt(s: openArray[char]; i: Natural): Rune {....raises: [], tags: [],
forbids: [].}Returns the rune in s at byte index i.
See also:
Example:
let a = "añyóng" doAssert a.runeAt(1) == "ñ".runeAt(0) doAssert a.runeAt(2) == "ñ".runeAt(1) doAssert a.runeAt(3) == "y".runeAt(0)Source Edit
proc runeAtPos(s: openArray[char]; pos: int): Rune {....raises: [], tags: [],
forbids: [].}Returns the rune at position pos.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Source Editproc runeAtPos(s: string; pos: int): Rune {.inline, ...raises: [], tags: [],
forbids: [].}Returns the rune at position pos.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Source Editproc runeLenAt(s: openArray[char]; i: Natural): int {....raises: [], tags: [],
forbids: [].}Returns the number of bytes the rune starting at s[i] takes.
See also:
Example:
let a = "añyóng" doAssert a.runeLenAt(0) == 1 doAssert a.runeLenAt(1) == 2Source Edit
proc runeOffset(s: openArray[char]; pos: Natural; start: Natural = 0): int {.
...raises: [], tags: [], forbids: [].}Returns the byte position of rune at position pos in s with an optional start byte position. Returns the special value -1 if it runs out of the string.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Example:
let a = "añyóng" doAssert a.runeOffset(1) == 1 doAssert a.runeOffset(3) == 4 doAssert a.runeOffset(4) == 6Source Edit
proc runeOffset(s: string; pos: Natural; start: Natural = 0): int {.inline,
...raises: [], tags: [], forbids: [].}Returns the byte position of rune at position pos in s with an optional start byte position. Returns the special value -1 if it runs out of the string.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Example:
let a = "añyóng" doAssert a.runeOffset(1) == 1 doAssert a.runeOffset(3) == 4 doAssert a.runeOffset(4) == 6Source Edit
proc runeReverseOffset(s: openArray[char]; rev: Positive): (int, int) {.
...raises: [], tags: [], forbids: [].}Returns a tuple with the byte offset of the rune at position rev in s, counting from the end (starting with 1) and the total number of runes in the string.
Returns a negative value for offset if there are too few runes in the string to satisfy the request.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Source Editproc runeReverseOffset(s: string; rev: Positive): (int, int) {.inline,
...raises: [], tags: [], forbids: [].}Returns a tuple with the byte offset of the rune at position rev in s, counting from the end (starting with 1) and the total number of runes in the string.
Returns a negative value for offset if there are too few runes in the string to satisfy the request.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Source Editproc runeStrAtPos(s: openArray[char]; pos: Natural): string {....raises: [],
tags: [], forbids: [].}Returns the rune at position pos as UTF8 String.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Source Editproc runeStrAtPos(s: string; pos: Natural): string {.inline, ...raises: [],
tags: [], forbids: [].}Returns the rune at position pos as UTF8 String.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Source Editproc runeSubStr(s: openArray[char]; pos: int; len: int = int.high): string {.
...raises: [], tags: [], forbids: [].}Returns the UTF-8 substring starting at code point pos with len code points.
If pos or len is negative they count from the end of the string. If len is not given it means the longest possible string.
Example:
let s = "Hänsel ««: 10,00€" doAssert(runeSubStr(s, 0, 2) == "Hä") doAssert(runeSubStr(s, 10, 1) == ":") doAssert(runeSubStr(s, -6) == "10,00€") doAssert(runeSubStr(s, 10) == ": 10,00€") doAssert(runeSubStr(s, 12, 5) == "10,00") doAssert(runeSubStr(s, -6, 3) == "10,")Source Edit
proc runeSubStr(s: string; pos: int; len: int = int.high): string {.inline,
...raises: [], tags: [], forbids: [].}Returns the UTF-8 substring starting at code point pos with len code points.
If pos or len is negative they count from the end of the string. If len is not given it means the longest possible string.
Example:
let s = "Hänsel ««: 10,00€" doAssert(runeSubStr(s, 0, 2) == "Hä") doAssert(runeSubStr(s, 10, 1) == ":") doAssert(runeSubStr(s, -6) == "10,00€") doAssert(runeSubStr(s, 10) == ": 10,00€") doAssert(runeSubStr(s, 12, 5) == "10,00") doAssert(runeSubStr(s, -6, 3) == "10,")Source Edit
proc split(s: openArray[char]; sep: Rune; maxsplit: int = -1): seq[string] {.
noSideEffect, ...gcsafe, extern: "nucSplitRune", raises: [], tags: [],
forbids: [].}proc split(s: openArray[char]; seps: openArray[Rune] = unicodeSpaces;
maxsplit: int = -1): seq[string] {.noSideEffect, ...gcsafe,
extern: "nucSplitRunes", raises: [], tags: [], forbids: [].}proc split(s: string; sep: Rune; maxsplit: int = -1): seq[string] {.
noSideEffect, inline, ...raises: [], tags: [], forbids: [].}proc split(s: string; seps: openArray[Rune] = unicodeSpaces; maxsplit: int = -1): seq[
string] {.noSideEffect, inline, ...raises: [], tags: [], forbids: [].}proc splitWhitespace(s: openArray[char]): seq[string] {.noSideEffect, ...gcsafe,
extern: "ncuSplitWhitespace", raises: [], tags: [], forbids: [].}proc splitWhitespace(s: string): seq[string] {.noSideEffect, inline, ...raises: [],
tags: [], forbids: [].}proc strip(s: openArray[char]; leading = true; trailing = true;
runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
...gcsafe, extern: "nucStrip", raises: [], tags: [], forbids: [].}Strips leading or trailing runes from s and returns the resulting string.
If leading is true (default), leading runes are stripped. If trailing is true (default), trailing runes are stripped. If both are false, the string is returned unchanged.
Example:
let a = "\táñyóng " doAssert a.strip == "áñyóng" doAssert a.strip(leading = false) == "\táñyóng" doAssert a.strip(trailing = false) == "áñyóng "Source Edit
proc strip(s: string; leading = true; trailing = true;
runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
inline, ...raises: [], tags: [], forbids: [].}Strips leading or trailing runes from s and returns the resulting string.
If leading is true (default), leading runes are stripped. If trailing is true (default), trailing runes are stripped. If both are false, the string is returned unchanged.
Example:
let a = "\táñyóng " doAssert a.strip == "áñyóng" doAssert a.strip(leading = false) == "\táñyóng" doAssert a.strip(trailing = false) == "áñyóng "Source Edit
proc swapCase(s: openArray[char]): string {.noSideEffect, ...gcsafe,
extern: "nuc$1", raises: [], tags: [], forbids: [].}Swaps the case of runes in s.
Returns a new string such that the cases of all runes are swapped if possible.
Example:
doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA" Source Edit proc title(s: openArray[char]): string {.noSideEffect, ...gcsafe, extern: "nuc$1",
raises: [], tags: [], forbids: [].}Converts s to a unicode title.
Returns a new string such that the first character in each word inside s is capitalized.
Example:
doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma" Source Edit proc toLower(c: Rune): Rune {....gcsafe, extern: "nuc$1", raises: [], tags: [],
forbids: [].}Converts c into lower case. This works for any rune.
If possible, prefer toLower over toUpper.
See also:
Source Editproc toUpper(c: Rune): Rune {....gcsafe, extern: "nuc$1", raises: [], tags: [],
forbids: [].}Converts c into upper case. This works for any rune.
If possible, prefer toLower over toUpper.
See also:
Source Editproc toUTF8(c: Rune): string {....gcsafe, extern: "nuc$1", raises: [], tags: [],
forbids: [].}Converts a rune into its UTF-8 representation.
See also:
toUTF8
Example:
let a = "añyóng" doAssert a.runeAt(1).toUTF8 == "ñ"Source Edit
proc translate(s: openArray[char]; replacements: proc (key: string): string): string {.
...gcsafe, extern: "nuc$1", effectsOf: replacements, ...raises: [], tags: [],
forbids: [].}Translates words in a string using the replacements proc to substitute words inside s with their replacements.
replacements is any proc that takes a word and returns a new word to fill its place.
Example:
proc wordToNumber(s: string): string =
  case s
  of "one": "1"
  of "two": "2"
  else: s
let a = "one two three four"
doAssert a.translate(wordToNumber) == "1 2 three four"
Source Edit
proc translate(s: string; replacements: proc (key: string): string): string {.
effectsOf: replacements, inline, ...raises: [], tags: [], forbids: [].}Translates words in a string using the replacements proc to substitute words inside s with their replacements.
replacements is any proc that takes a word and returns a new word to fill its place.
Example:
proc wordToNumber(s: string): string =
  case s
  of "one": "1"
  of "two": "2"
  else: s
let a = "one two three four"
doAssert a.translate(wordToNumber) == "1 2 three four"
Source Edit
proc validateUtf8(s: openArray[char]): int {....raises: [], tags: [], forbids: [].}Returns the position of the invalid byte in s if the string s does not hold valid UTF-8 data. Otherwise -1 is returned.
See also:
toUTF8
proc validateUtf8(s: string): int {.inline, ...raises: [], tags: [], forbids: [].}Returns the position of the invalid byte in s if the string s does not hold valid UTF-8 data. Otherwise -1 is returned.
See also:
toUTF8
iterator split(s: openArray[char]; sep: Rune; maxsplit: int = -1): string {.
...raises: [], tags: [], forbids: [].}
Splits the unicode string s into substrings using a single separator.
Substrings are separated by the rune sep.
Example:
import std/sequtils
assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) ==
@["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"] Source Edit iterator split(s: openArray[char]; seps: openArray[Rune] = unicodeSpaces;
maxsplit: int = -1): string {....raises: [], tags: [], forbids: [].}Splits the unicode string s into substrings using a group of separators.
Substrings are separated by a substring containing only seps.
Example:
import std/sequtils
assert toSeq("hÃllo\lthis\lis an\texample\l是".split) ==
@["hÃllo", "this", "is", "an", "example", "是"]
# And the following code splits the same string using a sequence of Runes.
assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) ==
@["añyóng", "hÃllo", "是", "example"]
# example with a `Rune` separator and unused one `;`:
assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""]
# Another example that splits a string containing a date.
let date = "2012-11-20T22:08:08.398990"
assert toSeq(split(date, " -:T".toRunes)) ==
@["2012", "11", "20", "22", "08", "08.398990"] Source Edit iterator split(s: string; sep: Rune; maxsplit: int = -1): string {....raises: [],
tags: [], forbids: [].}
Splits the unicode string s into substrings using a single separator.
Substrings are separated by the rune sep.
Example:
import std/sequtils
assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) ==
@["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"] Source Edit iterator split(s: string; seps: openArray[Rune] = unicodeSpaces;
maxsplit: int = -1): string {....raises: [], tags: [], forbids: [].}Splits the unicode string s into substrings using a group of separators.
Substrings are separated by a substring containing only seps.
Example:
import std/sequtils
assert toSeq("hÃllo\lthis\lis an\texample\l是".split) ==
@["hÃllo", "this", "is", "an", "example", "是"]
# And the following code splits the same string using a sequence of Runes.
assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) ==
@["añyóng", "hÃllo", "是", "example"]
# example with a `Rune` separator and unused one `;`:
assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""]
# Another example that splits a string containing a date.
let date = "2012-11-20T22:08:08.398990"
assert toSeq(split(date, " -:T".toRunes)) ==
@["2012", "11", "20", "22", "08", "08.398990"] Source Edit iterator utf8(s: openArray[char]): string {....raises: [], tags: [], forbids: [].}Iterates over any rune of the string s returning utf8 values.
See also:
toUTF8
iterator utf8(s: string): string {....raises: [], tags: [], forbids: [].}Iterates over any rune of the string s returning utf8 values.
See also:
toUTF8
template fastToUTF8Copy(c: Rune; s: var string; pos: int; doInc = true)
Copies UTF-8 representation of c into the preallocated string s starting at position pos.
If doInc == true (default), pos is incremented by the number of bytes that have been processed.
To be the most efficient, make sure s is preallocated with an additional amount equal to the byte length of c.
See also:
toUTF8
© 2006–2024 Andreas Rumpf
Licensed under the MIT License.
https://nim-lang.org/docs/unicode.html