% tounicode.w
% Copyright 2006 Han The Thanh, <thanh@@pdftex.org>
% Copyright 2006-2010 Taco Hoekwater <taco@@luatex.org>
% This file is part of LuaTeX.
% LuaTeX is free software; you can redistribute it and/or modify it under
% the terms of the GNU General Public License as published by the Free
% Software Foundation; either version 2 of the License, or (at your
% option) any later version.
% LuaTeX is distributed in the hope that it will be useful, but WITHOUT
% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
% FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
% License for more details.
% You should have received a copy of the GNU General Public License along
% with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
@ @c
#include "ptexlib.h"
/* version-control identification strings embedded in the compiled object */
static const char _svn_version[] =
    "$Id: tounicode.w 3967 2010-11-24 13:41:45Z taco $ "
    "$URL: http://foundry.supelec.fr/svn/luatex/tags/beta-0.70.1/source/texk/web2c/luatexdir/font/tounicode.w $";
@ @c
/* |isXdigit(c)| is true for a decimal digit or an upper-case hexadecimal
   letter 'A'..'F'; lower-case letters are not accepted. The argument is
   converted to |unsigned char| before it reaches |isdigit|, because passing
   a negative |char| value to the ctype functions is undefined. Note that
   the argument is evaluated more than once. */
#define isXdigit(c) (isdigit((unsigned char) (c)) || ('A' <= (c) && (c) <= 'F'))

/* special values of the |code| field of a |glyph_unicode_entry|;
   parenthesized so that they stay intact inside larger expressions */
#define UNI_UNDEF (-1)          /* no unicode value known */
#define UNI_STRING (-2)         /* string allocated by |def_tounicode()| */
#define UNI_EXTRA_STRING (-3)   /* string allocated by |set_glyph_unicode()| */

/* the glyph name to unicode database; created on first use by |def_tounicode()| */
static struct avl_table *glyph_unicode_tree = NULL;
static int comp_glyph_unicode_entry(const void *pa, const void *pb, void *p)
{
(void) p;
return strcmp(((const glyph_unicode_entry *) pa)->name,
((const glyph_unicode_entry *) pb)->name);
}
/* Allocates one |glyph_unicode_entry| and returns it in its blank state:
   no name, no unicode string, and |code| set to |UNI_UNDEF|. */
static glyph_unicode_entry *new_glyph_unicode_entry(void)
{
    glyph_unicode_entry *entry = xtalloc(1, glyph_unicode_entry);

    entry->unicode_seq = NULL;
    entry->code = UNI_UNDEF;
    entry->name = NULL;
    return entry;
}
/* Item destructor handed to |avl_destroy()|: releases everything owned by
   one database entry. |pa| is the |glyph_unicode_entry| to release; |pb|
   is not used. The |unicode_seq| string is owned by the entry only when
   |code == UNI_STRING|. The entry record itself was obtained from
   |new_glyph_unicode_entry()| and is released here as well, otherwise
   every record would leak when the tree is torn down. */
static void destroy_glyph_unicode_entry(void *pa, void *pb)
{
    glyph_unicode_entry *e = (glyph_unicode_entry *) pa;
    (void) pb;
    if (e == NULL)
        return;
    xfree(e->name);
    if (e->code == UNI_STRING) {
        assert(e->unicode_seq != NULL);
        xfree(e->unicode_seq);
    }
    xfree(e);
}
/* Destroys the glyph name database, if one was created. The tree pointer
   is reset afterwards so that the |NULL| tests elsewhere in this file see
   an empty database instead of a dangling pointer, and so that calling
   this function twice is harmless. */
void glyph_unicode_free(void)
{
    if (glyph_unicode_tree != NULL) {
        avl_destroy(glyph_unicode_tree, destroy_glyph_unicode_entry);
        glyph_unicode_tree = NULL;
    }
}
@ @c
/* Registers (or overrides) the unicode mapping of one glyph name.
   |glyph| is the glyph name and |unistr| its value: either a single
   upper-case hexadecimal number, or several hexadecimal groups separated
   by spaces, which are stored as one string with the spaces removed.
   Leading and trailing spaces of |unistr| are ignored. Invalid input (an
   empty name or value, a character that is neither a space nor an
   upper-case hex digit, or the name |notdef|) only produces a warning. */
void def_tounicode(str_number glyph, str_number unistr)
{
    char buf[SMALL_BUF_SIZE], *p, *ph;
    char *q;
    int valid_unistr;           /* 0: invalid; 1: unicode value; 2: string */
    int i, l;
    glyph_unicode_entry *gu, t;
    void **aa;
    p = makecstring(glyph);
    assert(strlen(p) < SMALL_BUF_SIZE);
    strcpy(buf, p);
    free(p);
    p = makecstring(unistr);
    ph = p;                     /* remember the start of the allocation for |free| */
    while (*p == ' ')
        p++;                    /* ignore leading spaces */
    l = (int) strlen(p);
    while (l > 0 && p[l - 1] == ' ')
        l--;                    /* ignore trailing spaces */
    valid_unistr = 1;           /* a unicode value is the most common case */
    for (i = 0; i < l; i++) {
        if (p[i] == ' ')
            valid_unistr = 2;   /* if a space occurs we treat this entry as a string */
        else if (!isXdigit(p[i])) {
            valid_unistr = 0;
            break;
        }
    }
    if (l == 0 || valid_unistr == 0 || strlen(buf) == 0
        || strcmp(buf, notdef) == 0) {
        pdftex_warn("ToUnicode: invalid parameter(s): `%s' => `%s'", buf, p);
        free(ph);               /* |p| points into |ph|, so release it only now */
        return;
    }
    if (glyph_unicode_tree == NULL) {
        glyph_unicode_tree =
            avl_create(comp_glyph_unicode_entry, NULL, &avl_xallocator);
        assert(glyph_unicode_tree != NULL);
    }
    t.name = buf;
    /* allow overriding existing entries */
    if ((gu = (glyph_unicode_entry *) avl_find(glyph_unicode_tree, &t)) != NULL) {
        if (gu->code == UNI_STRING) {
            assert(gu->unicode_seq != NULL);
            xfree(gu->unicode_seq);
        }
    } else {                    /* make new entry */
        gu = new_glyph_unicode_entry();
        gu->name = xstrdup(buf);
    }
    if (valid_unistr == 2) {    /* a string with space(s) */
        /* copy the first |l| characters of |p|, ignoring spaces; the
           destination is sized from |l|, so no fixed-size buffer can overflow */
        gu->unicode_seq = xmalloc((unsigned) (l + 1));
        q = gu->unicode_seq;
        for (i = 0; i < l; i++)
            if (p[i] != ' ')
                *q++ = p[i];
        *q = 0;
        gu->code = UNI_STRING;
    } else {
        i = sscanf(p, "%lX", &(gu->code));
        assert(i == 1);
    }
    aa = avl_probe(glyph_unicode_tree, gu);
    assert(aa != NULL);
    free(ph);
}
@ @c
/* Validates the hexadecimal text |s| and returns a code parsed from it,
   or |UNI_UNDEF| when |s| is not acceptable.

   With |multiple_value| set, |s| must consist of one or more groups of
   exactly four upper-case hex digits, each group lying in 0000..D7FF or
   E000..FFFF; the value of the last group is returned. Otherwise |s| must
   be four to six upper-case hex digits forming one value in 0000..D7FF or
   E000..10FFFF, which is returned. */
static long check_unicode_value(char *s, boolean multiple_value)
{
    const int len = (int) strlen(s);
    long code = 0;              /* anything that is not |UNI_UNDEF| will do */
    int pos;

    if (len == 0)
        return UNI_UNDEF;
    if (multiple_value) {
        if (len % 4 != 0)
            return UNI_UNDEF;
    } else if (len < 4 || len > 6) {
        return UNI_UNDEF;
    }

    /* every character must be an upper-case hex digit */
    for (pos = 0; pos < len; pos++) {
        if (!isXdigit(s[pos]))
            return UNI_UNDEF;
    }

    if (!multiple_value) {
        /* single value: parse the whole text at once */
        if (sscanf(s, "%lX", &code) != 1)
            return UNI_UNDEF;
        if (code < 0x0000 || (0xD7FF < code && code < 0xE000)
            || 0x10FFFF < code)
            return UNI_UNDEF;
        return code;
    }

    /* multiple value: parse and range-check each group of four digits */
    for (pos = 0; pos < len; pos += 4) {
        if (sscanf(s + pos, "%4lX", &code) != 1)
            return UNI_UNDEF;
        if (code < 0x0000 || (0xD7FF < code && code < 0xE000)
            || 0xFFFF < code)
            return UNI_UNDEF;
    }
    return code;
}
@ @c
/* Formats |code| as upper-case hexadecimal UTF-16BE text: four digits for
   values up to FFFF, otherwise a high/low surrogate pair of four digits
   each. The result lives in a static buffer, so it is overwritten by the
   next call and must be used or copied before then. */
static char *utf16be_str(long code)
{
    static char buf[SMALL_BUF_SIZE];

    assert(code >= 0);
    if (code > 0xFFFF) {
        const long offset = code - 0x10000;
        const unsigned high = (unsigned) (offset / 0x400 + 0xD800);
        const unsigned low = (unsigned) (offset % 0x400 + 0xDC00);

        sprintf(buf, "%04X%04X", high, low);
    } else {
        sprintf(buf, "%04lX", code);
    }
    return buf;
}
@ This function sets the appropriate values in |*gp| based on |s|; if it
returns with |gp->code == UNI_EXTRA_STRING|, then the caller is responsible
for freeing |gp->unicode_seq| too.
@c
/* Appends |src| to the string held in |dst|, whose capacity is |dstsize|
   bytes. Returns 1 on success; returns 0 and leaves |dst| unchanged when
   the result, including its terminator, would not fit. */
static int append_unicode_seq(char *dst, size_t dstsize, const char *src)
{
    if (strlen(dst) + strlen(src) >= dstsize)
        return 0;
    strcat(dst, src);
    return 1;
}

/* Derives the unicode mapping of glyph name |s| and stores it in |*gp|.
   |*gp| is left untouched when no mapping can be determined, so the
   caller presets |gp->code| to |UNI_UNDEF|.

   Everything from the first dot of the name onwards is ignored. A name
   with components separated by underscores is resolved component by
   component and the results are concatenated into a freshly allocated
   string (|gp->code == UNI_EXTRA_STRING|). Otherwise the name is looked up
   in the database, and failing that it is interpreted as "uniXXXX" (one
   or more groups of four hex digits) or "uXXXX" (four to six hex digits).

   Names or results that do not fit into the fixed-size work buffers are
   treated as having no mapping instead of overrunning the buffers. */
static void set_glyph_unicode(char *s, glyph_unicode_entry * gp)
{
    char buf[SMALL_BUF_SIZE], buf2[SMALL_BUF_SIZE], *p;
    const char *seq;
    long code;
    boolean last_component;
    int fits;
    glyph_unicode_entry tmp, *ptmp;
    /* skip dummy entries */
    if (s == NULL || s == notdef)
        return;
    /* strip everything after the first dot */
    p = strchr(s, '.');
    if (p != NULL) {
        if ((size_t) (p - s) >= sizeof(buf))
            return;             /* the part before the dot does not fit in |buf| */
        *buf = 0;
        strncat(buf, s, (size_t) (p - s));
        s = buf;
    }
    if (strlen(s) == 0)
        return;
    /* check for case of multiple components separated by |'_'| */
    p = strchr(s, '_');
    if (p != NULL) {
        if (s != buf) {
            /* work on a private copy, since the separators get overwritten */
            if (strlen(s) >= sizeof(buf))
                return;
            strcpy(buf, s);
            p = strchr(buf, '_');
            s = buf;
        }
        *buf2 = 0;
        last_component = false;
        for (;;) {
            *p = 0;             /* terminate the current component */
            tmp.code = UNI_UNDEF;
            tmp.unicode_seq = NULL;
            set_glyph_unicode(s, &tmp);
            seq = NULL;
            switch (tmp.code) {
            case UNI_UNDEF:    /* not found, do nothing */
                break;
            case UNI_STRING:   /* s matched an entry with string value in the database */
                assert(tmp.unicode_seq != NULL);
                seq = tmp.unicode_seq;
                break;
            case UNI_EXTRA_STRING:     /* s is a multiple value of form "uniXXXX" */
                seq = tmp.unicode_seq;
                break;
            default:           /* s matched an entry with numeric value in the
                                  database, or a value derived from "uXXXX" */
                assert(tmp.code >= 0);
                seq = utf16be_str(tmp.code);
            }
            fits = (seq == NULL
                    || append_unicode_seq(buf2, sizeof(buf2), seq));
            /* a |UNI_EXTRA_STRING| result is owned by us, see the contract above */
            if (tmp.code == UNI_EXTRA_STRING)
                xfree(tmp.unicode_seq);
            if (!fits)
                return;         /* combined value too long: no mapping */
            if (last_component)
                break;
            s = p + 1;
            p = strchr(s, '_');
            if (p == NULL) {
                p = strend(s);
                last_component = true;
            }
        }
        gp->code = UNI_EXTRA_STRING;
        gp->unicode_seq = xstrdup(buf2);
        return;
    }
    /* lookup for glyph name in the database */
    tmp.name = s;
    tmp.code = UNI_UNDEF;
    ptmp = (glyph_unicode_entry *) avl_find(glyph_unicode_tree, &tmp);
    if (ptmp != NULL) {
        gp->code = ptmp->code;
        gp->unicode_seq = ptmp->unicode_seq;    /* shared with the database, not copied */
        return;
    }
    /* check for case of "uniXXXX" (multiple 4-hex-digit values allowed) */
    if (str_prefix(s, "uni")) {
        p = s + strlen("uni");
        code = check_unicode_value(p, true);
        if (code != UNI_UNDEF) {
            if (strlen(p) == 4) /* single value */
                gp->code = code;
            else {              /* multiple value */
                gp->code = UNI_EXTRA_STRING;
                gp->unicode_seq = xstrdup(p);
            }
        }
        return;                 /* since the last case cannot happen */
    }
    /* check for case of "uXXXX" (single value up to 6 hex digits) */
    if (str_prefix(s, "u")) {
        p = s + strlen("u");
        code = check_unicode_value(p, false);
        if (code != UNI_UNDEF) {
            assert(code >= 0);
            gp->code = code;
        }
    }
}
@ @c
/* Fills |*gp| for character |index| of font |f|. When |font_tounicode(f)|
   is set and the character carries a tounicode string, a copy of that
   string is stored (|gp->code == UNI_EXTRA_STRING|, to be freed by the
   caller); otherwise |index| itself is stored as the code. */
static void set_cid_glyph_unicode(long index, glyph_unicode_entry * gp,
                                  internal_font_number f)
{
    char *mapping = NULL;

    if (font_tounicode(f))
        mapping = get_charinfo_tounicode(char_info(f, (int) index));
    if (mapping == NULL) {
        gp->code = index;       /* fallback */
        return;
    }
    gp->code = UNI_EXTRA_STRING;
    gp->unicode_seq = xstrdup(mapping);
}
@ @c
/* Writes a ToUnicode CMap stream for a 256-slot encoding and returns the
   number of the new PDF object. |glyph_names| holds the glyph name of each
   of the 256 slots and |name| is the encoding name; a trailing ".enc" is
   stripped from it, otherwise "-builtin" is appended. When the glyph
   database is empty nothing is written: |pdf->gen_tounicode| is cleared
   and 0 is returned.

   Runs of consecutive slots whose codes increase by one are written as
   bfrange entries, everything else as bfchar entries; both kinds are
   emitted in sections of at most 100 entries. */
int write_tounicode(PDF pdf, char **glyph_names, char *name)
{
    char buf[SMALL_BUF_SIZE], *p;
    static char builtin_suffix[] = "-builtin";
    short range_size[257];
    glyph_unicode_entry gtab[257];
    int objnum;
    int i, j;
    int bfchar_count, bfrange_count, subrange_count;
    assert(strlen(name) + strlen(builtin_suffix) < SMALL_BUF_SIZE);
    if (glyph_unicode_tree == NULL) {
#ifdef DEBUG
        pdftex_warn("no GlyphToUnicode entry has been inserted yet!");
#endif
        pdf->gen_tounicode = 0;
        return 0;
    }
    strcpy(buf, name);
    if ((p = strrchr(buf, '.')) != NULL && strcmp(p, ".enc") == 0)
        *p = 0;                 /* strip ".enc" from encoding name */
    else
        strcat(buf, builtin_suffix);    /* ".enc" not present, this is a builtin
                                           encoding so the name is eg "cmr10-builtin" */
    objnum = pdf_new_objnum(pdf);
    pdf_begin_dict(pdf, objnum, 0);
    pdf_begin_stream(pdf);
    pdf_printf(pdf, "%%!PS-Adobe-3.0 Resource-CMap\n"@/
               "%%%%DocumentNeededResources: ProcSet (CIDInit)\n"@/
               "%%%%IncludeResource: ProcSet (CIDInit)\n"@/
               "%%%%BeginResource: CMap (TeX-%s-0)\n"@/
               "%%%%Title: (TeX-%s-0 TeX %s 0)\n"@/
               "%%%%Version: 1.000\n"@/
               "%%%%EndComments\n"@/
               "/CIDInit /ProcSet findresource begin\n"@/
               "12 dict begin\n"@/
               "begincmap\n"@/
               "/CIDSystemInfo\n"@/
               "<< /Registry (TeX)\n"@/
               "/Ordering (%s)\n"@/
               "/Supplement 0\n"@/
               ">> def\n"@/
               "/CMapName /TeX-%s-0 def\n"@/
               "/CMapType 2 def\n"@/
               "1 begincodespacerange\n"@/
               "<00> <FF>\n" "endcodespacerange\n", buf, buf, buf, buf, buf);
    /* set gtab */
    for (i = 0; i < 256; ++i) {
        gtab[i].code = UNI_UNDEF;
        set_glyph_unicode(glyph_names[i], &gtab[i]);
    }
    gtab[256].code = UNI_UNDEF; /* sentinel that ends a run reaching slot 255 */
    /* set |range_size|; only the first slot of a run gets a value, the
       remaining slots of the run are skipped by every loop below */
    for (i = 0; i < 256;) {
        if (gtab[i].code == UNI_STRING || gtab[i].code == UNI_EXTRA_STRING) {
            range_size[i] = 1;  /* single entry */
            i++;
        } else if (gtab[i].code == UNI_UNDEF) {
            range_size[i] = 0;  /* no entry */
            i++;
        } else {                /* gtab[i].code >= 0 */
            j = i;
            while (i < 256 && gtab[i + 1].code >= 0 &&
                   gtab[i].code + 1 == gtab[i + 1].code)
                i++;
            /* at this point i is the last entry of the subrange */
            i++;                /* move i to the next entry */
            range_size[j] = (short) (i - j);
        }
    }
    /* calculate |bfrange_count| and |bfchar_count| */
    bfrange_count = 0;
    bfchar_count = 0;
    for (i = 0; i < 256;) {
        if (range_size[i] == 1) {
            bfchar_count++;
            i++;
        } else if (range_size[i] > 1) {
            bfrange_count++;
            i += range_size[i];
        } else
            i++;
    }
    /* write out bfrange */
    i = 0;
  write_bfrange:
    if (bfrange_count > 100)
        subrange_count = 100;
    else
        subrange_count = bfrange_count;
    bfrange_count -= subrange_count;
    pdf_printf(pdf, "%i beginbfrange\n", subrange_count);
    for (j = 0; j < subrange_count; j++) {
        /* test the bound first so that |range_size| is never read past slot 255 */
        while (i < 256 && range_size[i] <= 1)
            i++;
        assert(i < 256);
        pdf_printf(pdf, "<%02X> <%02X> <%s>\n", i, i + range_size[i] - 1,
                   utf16be_str(gtab[i].code));
        i += range_size[i];
    }
    pdf_printf(pdf, "endbfrange\n");
    if (bfrange_count > 0)
        goto write_bfrange;
    /* write out bfchar */
    i = 0;
  write_bfchar:
    if (bfchar_count > 100)
        subrange_count = 100;
    else
        subrange_count = bfchar_count;
    bfchar_count -= subrange_count;
    pdf_printf(pdf, "%i beginbfchar\n", subrange_count);
    for (j = 0; j < subrange_count; j++) {
        while (i < 256) {
            if (range_size[i] > 1)
                i += range_size[i];
            else if (range_size[i] == 0)
                i++;
            else                /* |range_size[i] == 1| */
                break;
        }
        assert(i < 256 && gtab[i].code != UNI_UNDEF);
        if (gtab[i].code == UNI_STRING || gtab[i].code == UNI_EXTRA_STRING) {
            assert(gtab[i].unicode_seq != NULL);
            pdf_printf(pdf, "<%02X> <%s>\n", i, gtab[i].unicode_seq);
        } else
            pdf_printf(pdf, "<%02X> <%s>\n", i, utf16be_str(gtab[i].code));
        i++;
    }
    pdf_printf(pdf, "endbfchar\n");
    if (bfchar_count > 0)
        goto write_bfchar;
    /* free strings allocated by |set_glyph_unicode()| */
    for (i = 0; i < 256; ++i) {
        if (gtab[i].code == UNI_EXTRA_STRING)
            xfree(gtab[i].unicode_seq);
    }
    pdf_printf(pdf, "endcmap\n"
               "CMapName currentdict /CMap defineresource pop\n"
               "end\n" "end\n" "%%%%EndResource\n" "%%%%EOF\n");
    pdf_end_stream(pdf);
    return objnum;
}
@ @c
/* Writes a ToUnicode CMap stream with a two-byte code space for the font
   object |fo| and returns the number of the new PDF object. The table is
   filled from the used characters of font |f| and of every font |k| with
   |pdf_font_num(k) == -f|, indexed by |char_index|.

   Runs of consecutive indices whose codes increase by one are written as
   bfrange entries, where a run never extends past an index whose low byte
   is FF; everything else is written as bfchar entries. Both kinds are
   emitted in sections of at most 100 entries. */
int write_cid_tounicode(PDF pdf, fo_entry * fo, internal_font_number f)
{
    int range_size[65537];
    glyph_unicode_entry gtab[65537];
    int objnum;
    int i, j, k;
    int bfchar_count, bfrange_count, subrange_count;
    char *buf;
    const char *tag;
    assert(fo->fd->fontname);
    tag = (fo->fd->subset_tag != NULL ? fo->fd->subset_tag : "UCS");
    /* room for the tag, the hyphen, the font name and the terminator;
       sized from the actual tag length instead of assuming a fixed one */
    buf = xmalloc((unsigned) (strlen(tag) + strlen(fo->fd->fontname) + 2));
    sprintf(buf, "%s-%s", tag, fo->fd->fontname);
    objnum = pdf_new_objnum(pdf);
    pdf_begin_dict(pdf, objnum, 0);
    pdf_begin_stream(pdf);
    pdf_printf(pdf, "%%!PS-Adobe-3.0 Resource-CMap\n"@/
               "%%%%DocumentNeededResources: ProcSet (CIDInit)\n"@/
               "%%%%IncludeResource: ProcSet (CIDInit)\n"@/
               "%%%%BeginResource: CMap (TeX-%s-0)\n"@/
               "%%%%Title: (TeX-%s-0 TeX %s 0)\n"@/
               "%%%%Version: 1.000\n"@/
               "%%%%EndComments\n"@/
               "/CIDInit /ProcSet findresource begin\n"@/
               "12 dict begin\n"@/
               "begincmap\n"@/
               "/CIDSystemInfo\n"@/
               "<< /Registry (TeX)\n"@/
               "/Ordering (%s)\n"@/
               "/Supplement 0\n"@/
               ">> def\n"@/
               "/CMapName /TeX-Identity-%s def\n"@/
               "/CMapType 2 def\n"@/
               "1 begincodespacerange\n"@/
               "<0000> <FFFF>\n"@/
               "endcodespacerange\n", buf, buf, buf, buf, buf);
    xfree(buf);
    /* set up gtab; the extra last element stays |UNI_UNDEF| and acts as
       the sentinel that ends a run reaching index 65535 */
    for (i = 0; i < 65537; ++i) {
        gtab[i].code = UNI_UNDEF;
    }
    for (k = 1; k <= max_font_id(); k++) {
        if (k == f || -f == pdf_font_num(k)) {
            for (i = font_bc(k); i <= font_ec(k); i++) {
                if (quick_char_exists(k, i) && char_used(k, i)) {
                    j = char_index(k, i);
                    /* ignore indices outside the two-byte code space
                       rather than writing outside |gtab|; the first
                       character seen for an index wins */
                    if (j >= 0 && j < 65536 && gtab[j].code == UNI_UNDEF) {
                        set_cid_glyph_unicode(i, &gtab[j], f);
                    }
                }
            }
        }
    }
    /* set |range_size|; only the first index of a run gets a value, the
       remaining indices of the run are skipped by every loop below */
    for (i = 0; i < 65536;) {
        if (gtab[i].code == UNI_STRING || gtab[i].code == UNI_EXTRA_STRING) {
            range_size[i] = 1;  /* single entry */
            i++;
        } else if (gtab[i].code == UNI_UNDEF) {
            range_size[i] = 0;  /* no entry */
            i++;
        } else {                /* |gtab[i].code >= 0| */
            j = i;
            k = i % 256;        /* low byte of the index; a run stops at FF */
            while (i < 65536 && k < 255 && gtab[i + 1].code >= 0 &&
                   gtab[i].code + 1 == gtab[i + 1].code) {
                i++;
                k++;
            }
            /* at this point i is the last entry of the subrange */
            i++;                /* move i to the next entry */
            range_size[j] = i - j;
        }
    }
    /* calculate |bfrange_count| and |bfchar_count| */
    bfrange_count = 0;
    bfchar_count = 0;
    for (i = 0; i < 65536;) {
        if (range_size[i] == 1) {
            bfchar_count++;
            i++;
        } else if (range_size[i] > 1) {
            bfrange_count++;
            i += range_size[i];
        } else
            i++;
    }
    /* write out bfrange */
    i = 0;
  write_bfrange:
    if (bfrange_count > 100)
        subrange_count = 100;
    else
        subrange_count = bfrange_count;
    bfrange_count -= subrange_count;
    pdf_printf(pdf, "%i beginbfrange\n", subrange_count);
    for (j = 0; j < subrange_count; j++) {
        /* test the bound first so that |range_size| is never read past index 65535 */
        while (i < 65536 && range_size[i] <= 1)
            i++;
        assert(i < 65536);
        pdf_printf(pdf, "<%04X> <%04X> <%s>\n", i, i + range_size[i] - 1,
                   utf16be_str(gtab[i].code));
        i += range_size[i];
    }
    pdf_printf(pdf, "endbfrange\n");
    if (bfrange_count > 0)
        goto write_bfrange;
    /* write out bfchar */
    i = 0;
  write_bfchar:
    if (bfchar_count > 100)
        subrange_count = 100;
    else
        subrange_count = bfchar_count;
    bfchar_count -= subrange_count;
    pdf_printf(pdf, "%i beginbfchar\n", subrange_count);
    for (j = 0; j < subrange_count; j++) {
        while (i < 65536) {
            if (range_size[i] > 1)
                i += range_size[i];
            else if (range_size[i] == 0)
                i++;
            else                /* |range_size[i] == 1| */
                break;
        }
        assert(i < 65536 && gtab[i].code != UNI_UNDEF);
        if (gtab[i].code == UNI_STRING || gtab[i].code == UNI_EXTRA_STRING) {
            assert(gtab[i].unicode_seq != NULL);
            pdf_printf(pdf, "<%04X> <%s>\n", i, gtab[i].unicode_seq);
        } else
            pdf_printf(pdf, "<%04X> <%s>\n", i, utf16be_str(gtab[i].code));
        i++;
    }
    pdf_printf(pdf, "endbfchar\n");
    if (bfchar_count > 0)
        goto write_bfchar;
    /* free strings allocated by |set_cid_glyph_unicode()| */
    for (i = 0; i < 65536; ++i) {
        if (gtab[i].code == UNI_EXTRA_STRING)
            xfree(gtab[i].unicode_seq);
    }
    pdf_printf(pdf, "endcmap\n"
               "CMapName currentdict /CMap defineresource pop\n"
               "end\n" "end\n" "%%%%EndResource\n" "%%%%EOF\n");
    pdf_end_stream(pdf);
    return objnum;
}