Line data Source code
1 : /* -*- buffer-read-only: t -*- vi: set ro: */
2 : /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 : #line 1
4 : /* Determine a canonical name for the current locale's character encoding.
5 :
6 : Copyright (C) 2000-2006, 2008 Free Software Foundation, Inc.
7 :
8 : This program is free software; you can redistribute it and/or modify
9 : it under the terms of the GNU General Public License as published by
10 : the Free Software Foundation; either version 3, or (at your option)
11 : any later version.
12 :
13 : This program is distributed in the hope that it will be useful,
14 : but WITHOUT ANY WARRANTY; without even the implied warranty of
15 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 : GNU General Public License for more details.
17 :
18 : You should have received a copy of the GNU General Public License along
19 : with this program; if not, write to the Free Software Foundation,
20 : Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 :
22 : /* Written by Bruno Haible <bruno@clisp.org>. */
23 :
24 : #include <config.h>
25 :
26 : /* Specification. */
27 : #include "localcharset.h"
28 :
29 : #include <stddef.h>
30 : #include <stdio.h>
31 : #include <string.h>
32 : #include <stdlib.h>
33 :
34 : #if defined _WIN32 || defined __WIN32__
35 : # define WIN32_NATIVE
36 : #endif
37 :
38 : #if defined __EMX__
39 : /* Assume EMX program runs on OS/2, even if compiled under DOS. */
40 : # ifndef OS2
41 : # define OS2
42 : # endif
43 : #endif
44 :
45 : #if !defined WIN32_NATIVE
46 : # if HAVE_LANGINFO_CODESET
47 : # include <langinfo.h>
48 : # else
49 : # if 0 /* see comment below */
50 : # include <locale.h>
51 : # endif
52 : # endif
53 : # ifdef __CYGWIN__
54 : # define WIN32_LEAN_AND_MEAN
55 : # include <windows.h>
56 : # endif
57 : #elif defined WIN32_NATIVE
58 : # define WIN32_LEAN_AND_MEAN
59 : # include <windows.h>
60 : #endif
61 : #if defined OS2
62 : # define INCL_DOS
63 : # include <os2.h>
64 : #endif
65 :
66 : #if ENABLE_RELOCATABLE
67 : # include "relocatable.h"
68 : #else
69 : # define relocate(pathname) (pathname)
70 : #endif
71 :
72 : /* Get LIBDIR. */
73 : #ifndef LIBDIR
74 : # include "configmake.h"
75 : #endif
76 :
77 : #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
78 : /* Win32, Cygwin, OS/2, DOS: both the slash and the backslash count as directory separators. */
79 : # define ISSLASH(C) ((C) == '/' || (C) == '\\')
80 : #endif
81 : /* Fallback separator, used unless DIRECTORY_SEPARATOR is already defined at this point. */
82 : #ifndef DIRECTORY_SEPARATOR
83 : # define DIRECTORY_SEPARATOR '/'
84 : #endif
85 : /* On every other platform only DIRECTORY_SEPARATOR itself is accepted by ISSLASH. */
86 : #ifndef ISSLASH
87 : # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
88 : #endif
89 : /* Route every later use of getc through getc_unlocked when HAVE_DECL_GETC_UNLOCKED is set. */
90 : #if HAVE_DECL_GETC_UNLOCKED
91 : # undef getc
92 : # define getc getc_unlocked
93 : #endif
94 :
95 : /* The following static variable is declared volatile to reduce the risk
96 : of a multithread problem in the function get_charset_aliases. If two
97 : threads initialize charset_aliases simultaneously, both are expected to
98 : produce a table with the same contents, so the outcome is fine provided
99 : that each assignment to charset_aliases is atomic. No locking is visible
100 : here, and what happens if the two assignments interleave is unknown. */
101 : #if __STDC__ != 1
102 : # define volatile /* expands to nothing when __STDC__ is not 1 */
103 : #endif
104 : /* Pointer to the alias table returned by get_charset_aliases, or NULL while
105 : the table has not been set up yet. The table is a list of string pairs:
106 : ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
107 : static const char * volatile charset_aliases;
108 321 :
109 : /* Return a pointer to the contents of the charset.alias file. */
110 : static const char *
111 : get_charset_aliases (void)
112 321 : {
113 321 : const char *cp;
114 :
115 : cp = charset_aliases;
116 : if (cp == NULL)
117 : {
118 317 : #if !(defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
119 : FILE *fp;
120 : const char *dir;
121 : const char *base = "charset.alias";
122 : char *file_name;
123 317 :
124 317 : /* Make it possible to override the charset.alias location. This is
125 317 : necessary for running the testsuite before "make install". */
126 : dir = getenv ("CHARSETALIASDIR");
127 : if (dir == NULL || dir[0] == '\0')
128 : dir = relocate (LIBDIR);
129 317 :
130 317 : /* Concatenate dir and base into freshly allocated file_name. */
131 317 : {
132 317 : size_t dir_len = strlen (dir);
133 317 : size_t base_len = strlen (base);
134 : int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
135 317 : file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
136 317 : if (file_name != NULL)
137 317 : {
138 317 : memcpy (file_name, dir, dir_len);
139 : if (add_slash)
140 : file_name[dir_len] = DIRECTORY_SEPARATOR;
141 : memcpy (file_name + dir_len + add_slash, base, base_len + 1);
142 317 : }
143 : }
144 317 :
145 : if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
146 : /* Out of memory or file not found, treat it as empty. */
147 : cp = "";
148 0 : else
149 0 : {
150 : /* Parse the file's contents. */
151 : char *res_ptr = NULL;
152 0 : size_t res_size = 0;
153 :
154 : for (;;)
155 : {
156 : int c;
157 : char buf1[50+1];
158 : char buf2[50+1];
159 0 : size_t l1, l2;
160 0 : char *old_res_ptr;
161 0 :
162 0 : c = getc (fp);
163 0 : if (c == EOF)
164 0 : break;
165 : if (c == '\n' || c == ' ' || c == '\t')
166 : continue;
167 : if (c == '#')
168 0 : {
169 0 : /* Skip comment, to end of line. */
170 0 : do
171 0 : c = getc (fp);
172 0 : while (!(c == EOF || c == '\n'));
173 : if (c == EOF)
174 0 : break;
175 0 : continue;
176 0 : }
177 0 : ungetc (c, fp);
178 0 : if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
179 0 : break;
180 0 : l1 = strlen (buf1);
181 : l2 = strlen (buf2);
182 0 : old_res_ptr = res_ptr;
183 0 : if (res_size == 0)
184 : {
185 : res_size = l1 + 1 + l2 + 1;
186 : res_ptr = (char *) malloc (res_size + 1);
187 0 : }
188 0 : else
189 : {
190 0 : res_size += l1 + 1 + l2 + 1;
191 : res_ptr = (char *) realloc (res_ptr, res_size + 1);
192 : }
193 0 : if (res_ptr == NULL)
194 0 : {
195 0 : /* Out of memory. */
196 0 : res_size = 0;
197 : if (old_res_ptr != NULL)
198 0 : free (old_res_ptr);
199 0 : break;
200 : }
201 0 : strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
202 0 : strcpy (res_ptr + res_size - (l2 + 1), buf2);
203 0 : }
204 : fclose (fp);
205 : if (res_size == 0)
206 0 : cp = "";
207 0 : else
208 : {
209 : *(res_ptr + res_size) = '\0';
210 : cp = res_ptr;
211 317 : }
212 317 : }
213 :
214 : if (file_name != NULL)
215 : free (file_name);
216 :
217 : #else
218 :
219 : # if defined VMS
220 : /* To avoid the troubles of an extra file charset.alias_vms in the
221 : sources of many GNU packages, simply inline the aliases here. */
222 : /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
223 : "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
224 : section 10.7 "Handling Different Character Sets". */
225 : cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
226 : "ISO8859-2" "\0" "ISO-8859-2" "\0"
227 : "ISO8859-5" "\0" "ISO-8859-5" "\0"
228 : "ISO8859-7" "\0" "ISO-8859-7" "\0"
229 : "ISO8859-8" "\0" "ISO-8859-8" "\0"
230 : "ISO8859-9" "\0" "ISO-8859-9" "\0"
231 : /* Japanese */
232 : "eucJP" "\0" "EUC-JP" "\0"
233 : "SJIS" "\0" "SHIFT_JIS" "\0"
234 : "DECKANJI" "\0" "DEC-KANJI" "\0"
235 : "SDECKANJI" "\0" "EUC-JP" "\0"
236 : /* Chinese */
237 : "eucTW" "\0" "EUC-TW" "\0"
238 : "DECHANYU" "\0" "DEC-HANYU" "\0"
239 : "DECHANZI" "\0" "GB2312" "\0"
240 : /* Korean */
241 : "DECKOREAN" "\0" "EUC-KR" "\0";
242 : # endif
243 :
244 : # if defined WIN32_NATIVE || defined __CYGWIN__
245 : /* To avoid the troubles of installing a separate file in the same
246 : directory as the DLL and of retrieving the DLL's directory at
247 : runtime, simply inline the aliases here. */
248 :
249 : cp = "CP936" "\0" "GBK" "\0"
250 : "CP1361" "\0" "JOHAB" "\0"
251 : "CP20127" "\0" "ASCII" "\0"
252 : "CP20866" "\0" "KOI8-R" "\0"
253 : "CP20936" "\0" "GB2312" "\0"
254 : "CP21866" "\0" "KOI8-RU" "\0"
255 : "CP28591" "\0" "ISO-8859-1" "\0"
256 : "CP28592" "\0" "ISO-8859-2" "\0"
257 : "CP28593" "\0" "ISO-8859-3" "\0"
258 : "CP28594" "\0" "ISO-8859-4" "\0"
259 : "CP28595" "\0" "ISO-8859-5" "\0"
260 : "CP28596" "\0" "ISO-8859-6" "\0"
261 : "CP28597" "\0" "ISO-8859-7" "\0"
262 : "CP28598" "\0" "ISO-8859-8" "\0"
263 : "CP28599" "\0" "ISO-8859-9" "\0"
264 : "CP28605" "\0" "ISO-8859-15" "\0"
265 : "CP38598" "\0" "ISO-8859-8" "\0"
266 : "CP51932" "\0" "EUC-JP" "\0"
267 : "CP51936" "\0" "GB2312" "\0"
268 : "CP51949" "\0" "EUC-KR" "\0"
269 : "CP51950" "\0" "EUC-TW" "\0"
270 : "CP54936" "\0" "GB18030" "\0"
271 : "CP65001" "\0" "UTF-8" "\0";
272 317 : # endif
273 : #endif
274 :
275 321 : charset_aliases = cp;
276 : }
277 :
278 : return cp;
279 : }
280 :
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The raw name comes from nl_langinfo (CODESET) where available, otherwise
   from the environment variables LC_ALL, LC_CTYPE, LANG (in this order of
   precedence) or from the system's codepage number (native Win32, Cygwin,
   OS/2).  It is then looked up in the table returned by
   get_charset_aliases; an alias entry named "*" matches any name.

   The result must not be freed.  It points into a static buffer, into the
   alias table, into a string literal, or - when the encoding is taken from
   the part of an environment variable after the dot - into the environment
   string itself.  The static buffers are shared between calls.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never NULL and never the empty string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  As long as this is not fixed, return the suffix
     of the locale name from the environment variables (if present) or
     the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                {
                  /* Never hand out an empty name (locale "xx.").  */
                  if (dot[0] != '\0')
                    return dot;
                }
              else
                {
                  size_t len = (size_t) (modifier - dot);

                  /* Skip an empty name (locale "xx.@mod") and names that
                     do not fit into buf.  */
                  if (len > 0 && len < sizeof (buf))
                    {
                      memcpy (buf, dot, len);
                      buf[len] = '\0';
                      return buf;
                    }
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              /* Never hand out an empty name (locale "xx.").  */
              if (dot[0] != '\0')
                return dot;
            }
          else
            {
              size_t len = (size_t) (modifier - dot);

              /* Skip an empty name (locale "xx.@mod") and names that do
                 not fit into buf.  */
              if (len > 0 && len < sizeof (buf))
                {
                  memcpy (buf, dot, len);
                  buf[len] = '\0';
                  return buf;
                }
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it so that the argument has the type
             that the %u conversion requires.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a list of (alias, canonical) string pairs,
     so each step skips two strings.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
|