|
| 1 | +// Copyright (c) Microsoft Corporation. |
| 2 | +// Licensed under the MIT license. |
| 3 | + |
| 4 | +package sqlcmd |
| 5 | + |
| 6 | +import ( |
| 7 | + "sort" |
| 8 | + "strconv" |
| 9 | + "strings" |
| 10 | + |
| 11 | + "github.com/microsoft/go-sqlcmd/internal/localizer" |
| 12 | + "golang.org/x/text/encoding" |
| 13 | + "golang.org/x/text/encoding/charmap" |
| 14 | + "golang.org/x/text/encoding/japanese" |
| 15 | + "golang.org/x/text/encoding/korean" |
| 16 | + "golang.org/x/text/encoding/simplifiedchinese" |
| 17 | + "golang.org/x/text/encoding/traditionalchinese" |
| 18 | + "golang.org/x/text/encoding/unicode" |
| 19 | +) |
| 20 | + |
| 21 | +// codepageEntry defines a codepage with its encoding and metadata |
| 22 | +type codepageEntry struct { |
| 23 | + encoding encoding.Encoding // nil for UTF-8 (Go's native encoding) |
| 24 | + name string |
| 25 | + description string |
| 26 | +} |
| 27 | + |
| 28 | +// codepageRegistry is the single source of truth for all supported codepages |
| 29 | +// that work cross-platform. Both GetEncoding and SupportedCodePages use this |
| 30 | +// registry. On Windows, additional codepages installed on the system are also |
| 31 | +// available via the Windows API fallback in GetEncoding. |
| 32 | +var codepageRegistry = map[int]codepageEntry{ |
| 33 | + // Unicode |
| 34 | + 65001: {nil, "UTF-8", "Unicode (UTF-8)"}, |
| 35 | + 1200: {unicode.UTF16(unicode.LittleEndian, unicode.UseBOM), "UTF-16LE", "Unicode (UTF-16 Little-Endian)"}, |
| 36 | + 1201: {unicode.UTF16(unicode.BigEndian, unicode.UseBOM), "UTF-16BE", "Unicode (UTF-16 Big-Endian)"}, |
| 37 | + |
| 38 | + // OEM/DOS codepages |
| 39 | + 437: {charmap.CodePage437, "CP437", "OEM United States"}, |
| 40 | + 850: {charmap.CodePage850, "CP850", "OEM Multilingual Latin 1"}, |
| 41 | + 852: {charmap.CodePage852, "CP852", "OEM Latin 2"}, |
| 42 | + 855: {charmap.CodePage855, "CP855", "OEM Cyrillic"}, |
| 43 | + 858: {charmap.CodePage858, "CP858", "OEM Multilingual Latin 1 + Euro"}, |
| 44 | + 860: {charmap.CodePage860, "CP860", "OEM Portuguese"}, |
| 45 | + 862: {charmap.CodePage862, "CP862", "OEM Hebrew"}, |
| 46 | + 863: {charmap.CodePage863, "CP863", "OEM Canadian French"}, |
| 47 | + 865: {charmap.CodePage865, "CP865", "OEM Nordic"}, |
| 48 | + 866: {charmap.CodePage866, "CP866", "OEM Russian"}, |
| 49 | + |
| 50 | + // Windows codepages |
| 51 | + 874: {charmap.Windows874, "Windows-874", "Thai"}, |
| 52 | + 1250: {charmap.Windows1250, "Windows-1250", "Central European"}, |
| 53 | + 1251: {charmap.Windows1251, "Windows-1251", "Cyrillic"}, |
| 54 | + 1252: {charmap.Windows1252, "Windows-1252", "Western European"}, |
| 55 | + 1253: {charmap.Windows1253, "Windows-1253", "Greek"}, |
| 56 | + 1254: {charmap.Windows1254, "Windows-1254", "Turkish"}, |
| 57 | + 1255: {charmap.Windows1255, "Windows-1255", "Hebrew"}, |
| 58 | + 1256: {charmap.Windows1256, "Windows-1256", "Arabic"}, |
| 59 | + 1257: {charmap.Windows1257, "Windows-1257", "Baltic"}, |
| 60 | + 1258: {charmap.Windows1258, "Windows-1258", "Vietnamese"}, |
| 61 | + |
| 62 | + // ISO-8859 codepages |
| 63 | + 28591: {charmap.ISO8859_1, "ISO-8859-1", "Latin 1 (Western European)"}, |
| 64 | + 28592: {charmap.ISO8859_2, "ISO-8859-2", "Latin 2 (Central European)"}, |
| 65 | + 28593: {charmap.ISO8859_3, "ISO-8859-3", "Latin 3 (South European)"}, |
| 66 | + 28594: {charmap.ISO8859_4, "ISO-8859-4", "Latin 4 (North European)"}, |
| 67 | + 28595: {charmap.ISO8859_5, "ISO-8859-5", "Cyrillic"}, |
| 68 | + 28596: {charmap.ISO8859_6, "ISO-8859-6", "Arabic"}, |
| 69 | + 28597: {charmap.ISO8859_7, "ISO-8859-7", "Greek"}, |
| 70 | + 28598: {charmap.ISO8859_8, "ISO-8859-8", "Hebrew"}, |
| 71 | + 28599: {charmap.ISO8859_9, "ISO-8859-9", "Turkish"}, |
| 72 | + 28600: {charmap.ISO8859_10, "ISO-8859-10", "Nordic"}, |
| 73 | + 28603: {charmap.ISO8859_13, "ISO-8859-13", "Baltic"}, |
| 74 | + 28604: {charmap.ISO8859_14, "ISO-8859-14", "Celtic"}, |
| 75 | + 28605: {charmap.ISO8859_15, "ISO-8859-15", "Latin 9 (Western European with Euro)"}, |
| 76 | + 28606: {charmap.ISO8859_16, "ISO-8859-16", "Latin 10 (South-Eastern European)"}, |
| 77 | + |
| 78 | + // Cyrillic |
| 79 | + 20866: {charmap.KOI8R, "KOI8-R", "Russian"}, |
| 80 | + 21866: {charmap.KOI8U, "KOI8-U", "Ukrainian"}, |
| 81 | + |
| 82 | + // Macintosh |
| 83 | + 10000: {charmap.Macintosh, "Macintosh", "Mac Roman"}, |
| 84 | + 10007: {charmap.MacintoshCyrillic, "x-mac-cyrillic", "Mac Cyrillic"}, |
| 85 | + |
| 86 | + // EBCDIC |
| 87 | + 37: {charmap.CodePage037, "IBM037", "EBCDIC US-Canada"}, |
| 88 | + 1047: {charmap.CodePage1047, "IBM1047", "EBCDIC Latin 1/Open System"}, |
| 89 | + 1140: {charmap.CodePage1140, "IBM01140", "EBCDIC US-Canada with Euro"}, |
| 90 | + |
| 91 | + // Japanese |
| 92 | + 932: {japanese.ShiftJIS, "Shift_JIS", "Japanese (Shift-JIS)"}, |
| 93 | + 20932: {japanese.EUCJP, "EUC-JP", "Japanese (EUC)"}, |
| 94 | + 50220: {japanese.ISO2022JP, "ISO-2022-JP", "Japanese (JIS)"}, |
| 95 | + 50221: {japanese.ISO2022JP, "csISO2022JP", "Japanese (JIS-Allow 1 byte Kana)"}, |
| 96 | + 50222: {japanese.ISO2022JP, "ISO-2022-JP", "Japanese (JIS-Allow 1 byte Kana SO/SI)"}, |
| 97 | + |
| 98 | + // Korean |
| 99 | + 949: {korean.EUCKR, "EUC-KR", "Korean"}, |
| 100 | + 51949: {korean.EUCKR, "EUC-KR", "Korean (EUC)"}, |
| 101 | + |
| 102 | + // Simplified Chinese |
| 103 | + 936: {simplifiedchinese.GBK, "GBK", "Chinese Simplified (GBK)"}, |
| 104 | + 54936: {simplifiedchinese.GB18030, "GB18030", "Chinese Simplified (GB18030)"}, |
| 105 | + 52936: {simplifiedchinese.HZGB2312, "HZ-GB-2312", "Chinese Simplified (HZ)"}, |
| 106 | + |
| 107 | + // Traditional Chinese |
| 108 | + 950: {traditionalchinese.Big5, "Big5", "Chinese Traditional (Big5)"}, |
| 109 | +} |
| 110 | + |
// CodePageSettings is the parsed form of the -f codepage argument. A value of
// 0 in either field means that no codepage was specified for that direction.
type CodePageSettings struct {
	// InputCodePage is the codepage number given as "i:<codepage>", or as a
	// bare number that applies to both directions.
	InputCodePage int
	// OutputCodePage is the codepage number given as "o:<codepage>", or as a
	// bare number that applies to both directions.
	OutputCodePage int
}
| 116 | + |
| 117 | +// ParseCodePage parses the -f codepage argument |
| 118 | +// Format: codepage | i:codepage[,o:codepage] | o:codepage[,i:codepage] |
| 119 | +func ParseCodePage(arg string) (*CodePageSettings, error) { |
| 120 | + if arg == "" { |
| 121 | + return nil, nil |
| 122 | + } |
| 123 | + |
| 124 | + settings := &CodePageSettings{} |
| 125 | + parts := strings.Split(arg, ",") |
| 126 | + |
| 127 | + for _, part := range parts { |
| 128 | + part = strings.TrimSpace(part) |
| 129 | + if part == "" { |
| 130 | + continue |
| 131 | + } |
| 132 | + |
| 133 | + if strings.HasPrefix(strings.ToLower(part), "i:") { |
| 134 | + // Input codepage |
| 135 | + cp, err := strconv.Atoi(strings.TrimPrefix(strings.ToLower(part), "i:")) |
| 136 | + if err != nil { |
| 137 | + return nil, localizer.Errorf("invalid input codepage: %s", part) |
| 138 | + } |
| 139 | + settings.InputCodePage = cp |
| 140 | + } else if strings.HasPrefix(strings.ToLower(part), "o:") { |
| 141 | + // Output codepage |
| 142 | + cp, err := strconv.Atoi(strings.TrimPrefix(strings.ToLower(part), "o:")) |
| 143 | + if err != nil { |
| 144 | + return nil, localizer.Errorf("invalid output codepage: %s", part) |
| 145 | + } |
| 146 | + settings.OutputCodePage = cp |
| 147 | + } else { |
| 148 | + // Both input and output |
| 149 | + cp, err := strconv.Atoi(part) |
| 150 | + if err != nil { |
| 151 | + return nil, localizer.Errorf("invalid codepage: %s", part) |
| 152 | + } |
| 153 | + settings.InputCodePage = cp |
| 154 | + settings.OutputCodePage = cp |
| 155 | + } |
| 156 | + } |
| 157 | + |
| 158 | + // If a non-empty argument was provided but no codepage was parsed, |
| 159 | + // treat this as an error rather than silently disabling codepage handling. |
| 160 | + if settings.InputCodePage == 0 && settings.OutputCodePage == 0 { |
| 161 | + return nil, localizer.Errorf("invalid codepage: %s", arg) |
| 162 | + } |
| 163 | + |
| 164 | + // Validate codepages |
| 165 | + if settings.InputCodePage != 0 { |
| 166 | + if _, err := GetEncoding(settings.InputCodePage); err != nil { |
| 167 | + return nil, err |
| 168 | + } |
| 169 | + } |
| 170 | + if settings.OutputCodePage != 0 { |
| 171 | + if _, err := GetEncoding(settings.OutputCodePage); err != nil { |
| 172 | + return nil, err |
| 173 | + } |
| 174 | + } |
| 175 | + |
| 176 | + return settings, nil |
| 177 | +} |
| 178 | + |
| 179 | +// GetEncoding returns the encoding for a given Windows codepage number. |
| 180 | +// Returns nil for UTF-8 (65001) since Go uses UTF-8 natively. |
| 181 | +// If the codepage is not in the built-in registry, falls back to |
| 182 | +// OS-specific support (Windows API on Windows, error on other platforms). |
| 183 | +func GetEncoding(codepage int) (encoding.Encoding, error) { |
| 184 | + entry, ok := codepageRegistry[codepage] |
| 185 | + if !ok { |
| 186 | + // Fallback to system-provided codepage support |
| 187 | + return getSystemCodePageEncoding(codepage) |
| 188 | + } |
| 189 | + return entry.encoding, nil |
| 190 | +} |
| 191 | + |
// CodePageInfo is the public description of one supported codepage, as
// returned by SupportedCodePages.
type CodePageInfo struct {
	// CodePage is the numeric codepage identifier.
	CodePage int
	// Name is the short name of the codepage.
	Name string
	// Description is a human-readable label for the codepage.
	Description string
}
| 198 | + |
| 199 | +// SupportedCodePages returns a list of all supported codepages with descriptions |
| 200 | +func SupportedCodePages() []CodePageInfo { |
| 201 | + result := make([]CodePageInfo, 0, len(codepageRegistry)) |
| 202 | + for cp, entry := range codepageRegistry { |
| 203 | + result = append(result, CodePageInfo{ |
| 204 | + CodePage: cp, |
| 205 | + Name: entry.name, |
| 206 | + Description: entry.description, |
| 207 | + }) |
| 208 | + } |
| 209 | + // Sort by codepage number for consistent output |
| 210 | + sort.Slice(result, func(i, j int) bool { |
| 211 | + return result[i].CodePage < result[j].CodePage |
| 212 | + }) |
| 213 | + return result |
| 214 | +} |
0 commit comments