summaryrefslogtreecommitdiff
path: root/python/cjk.py
blob: 9a417d1bb69f81fdfae294266ab569d4d8578c2d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python

# Unicode blocks that are treated as CJK.  Every entry describes one block as
# an inclusive range: "from" and "to" each hold a single-character string.

# Chinese
ranges = [{"from": "\u4E00", "to": "\u9FFF"},  # CJK Unified Ideographs
          {"from": "\u3400", "to": "\u4DBF"},  # CJK Unified Ideographs Extension A
          {"from": "\u2E80", "to": "\u2EFF"},  # CJK Radicals Supplement
          {"from": "\u2F00", "to": "\u2FDF"},  # Kangxi Radicals
          {"from": "\u2FF0", "to": "\u2FFF"},  # Ideographic Description Characters
          {"from": "\u31C0", "to": "\u31EF"},  # CJK Strokes
          {"from": "\u3200", "to": "\u32FF"},  # Enclosed CJK Letters and Months
          {"from": "\u3300", "to": "\u33FF"},  # CJK Compatibility
          {"from": "\uF900", "to": "\uFAFF"},  # CJK Compatibility Ideographs
          ]

# Japanese
ranges.extend([{"from": "\u3040", "to": "\u309F"},   # Hiragana
               {"from": "\u30A0", "to": "\u30FF"},   # Katakana
               {"from": "\u31F0", "to": "\u31FF"}])  # Katakana Phonetic Extensions

# Korean
ranges.extend([{"from": "\u1100", "to": "\u11FF"},   # Hangul Jamo
               {"from": "\uA960", "to": "\uA97F"},   # Hangul Jamo Extended-A
               {"from": "\uD7B0", "to": "\uD7FF"},   # Hangul Jamo Extended-B
               {"from": "\u3130", "to": "\u318F"},   # Hangul Compatibility Jamo
               {"from": "\uAC00", "to": "\uD7AF"}])  # Hangul Syllables

# Punctuation, etc.
ranges.extend([{"from": "\u3000", "to": "\u303F"},   # CJK Symbols and Punctuation
               {"from": "\uFE30", "to": "\uFE4F"},   # CJK Compatibility Forms
               {"from": "\uFE50", "to": "\uFE6F"},   # Small Form Variants
               {"from": "\uFE10", "to": "\uFE1F"},   # Vertical Forms
               {"from": "\uFF00", "to": "\uFFEF"}])  # Halfwidth and Fullwidth Forms

# Supplementary-plane blocks (code points above U+FFFF).  These need the
# eight-digit "\UXXXXXXXX" escape: a five-digit "\u20000" is read as U+2000
# followed by the character "0", which is not a single character.  They are
# appended after the BMP blocks so the positions of the entries above do not
# change.
ranges.extend([{"from": "\U00020000", "to": "\U0002A6DF"},   # CJK Unified Ideographs Extension B
               {"from": "\U0002A700", "to": "\U0002B73F"},   # CJK Unified Ideographs Extension C
               {"from": "\U0002B740", "to": "\U0002B81F"},   # CJK Unified Ideographs Extension D
               {"from": "\U0002B820", "to": "\U0002CEAF"},   # CJK Unified Ideographs Extension E
               {"from": "\U0002CEB0", "to": "\U0002EBEF"},   # CJK Unified Ideographs Extension F
               {"from": "\U0001F200", "to": "\U0001F2FF"},   # Enclosed Ideographic Supplement
               {"from": "\U0002F800", "to": "\U0002FA1F"},   # CJK Compatibility Ideographs Supplement
               {"from": "\U0001B100", "to": "\U0001B12F"},   # Kana Extended-A
               {"from": "\U0001B000", "to": "\U0001B0FF"},   # Kana Supplement
               {"from": "\U00016FE0", "to": "\U00016FFF"}])  # Ideographic Symbols and Punctuation

def is_cjk(char):
    """Return True if *char* lies inside any range listed in ``ranges``.

    ``char`` is handed to ``ord()``, so it must be a single-character
    string; anything else raises ``TypeError``.  Each entry of the
    module-level ``ranges`` list supplies its bounds as single characters
    under the keys ``"from"`` and ``"to"``, and both bounds are inclusive.
    """
    code_point = ord(char)
    # A generator lets any() stop at the first matching range, and the loop
    # variable is deliberately not called "range" so the builtin is not shadowed.
    return any(ord(block["from"]) <= code_point <= ord(block["to"])
               for block in ranges)