diff --git a/CHANGELOG.md b/CHANGELOG.md index dfae592..70fda98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,12 @@ - `fontFeatureSettings`: Low-level control over OpenType features as a `Record` (e.g. `{ smcp: true, tnum: true }`). +- A `language` property (BCP 47 tag) on both the document definition + and text properties. The document-level language is written to the + PDF catalog (for accessibility). Fonts that provide language-specific + typographic behavior (e.g. for Turkish or Serbian) will use the + correct glyph forms for the specified language. + ### Changed - Replaced `pdf-lib` with `@ralfstx/pdf-core` as the underlying PDF diff --git a/README.md b/README.md index 0efcf21..d878d83 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,10 @@ text spans. The following text properties are supported: An object where each key is a four-character OpenType feature tag and the value enables (`true`) or disables (`false`) that feature (e.g. `{ smcp: true, tnum: true }`). +- `language`: The language of the text as a BCP 47 tag (e.g. `'en'`, + `'tr'`, `'sr'`). The language may influence text shaping in fonts + that provide language-specific typographic behavior. If not set, the + document's default language is used. ### Images @@ -571,6 +575,17 @@ text([ ... ``` +## Language + +The `language` property sets the default language of the document as a +BCP 47 tag (e.g. `'en'`, `'de'`, `'ar'`). It is written to the PDF +catalog for accessibility support. + +The language can also be set on individual text blocks or spans using +the `language` [text property](#text-properties). Fonts that provide +language-specific typographic behavior (e.g. for Turkish or Serbian) +will use the correct glyph forms based on the language of the text. 
+ ## Metadata PDF documents can include metadata such as the title, author, subject, diff --git a/scripts/generate-language-tags.ts b/scripts/generate-language-tags.ts new file mode 100644 index 0000000..7eb4cb0 --- /dev/null +++ b/scripts/generate-language-tags.ts @@ -0,0 +1,315 @@ +/** + * Generates src/language-tags.gen.ts from the OpenType language system tag + * registry and the IANA BCP 47 subtag registry. + * + * The approach is based on HarfBuzz's gen-tag-table.py: + * https://github.com/harfbuzz/harfbuzz/blob/main/src/gen-tag-table.py + * We reuse the same strategy of parsing the OT registry HTML, resolving + * ISO 639-3 codes to 2-letter BCP 47 subtags, inheriting mappings from + * macrolanguages, ranking by code count to pick the most specific tag, + * and applying manual overrides for cases where the automated ranking + * disagrees with HarfBuzz (Norwegian, Chinese, Quechua, Malayalam). + * Our script is a simplified TypeScript rewrite that only produces the + * BCP 47 → OpenType direction and only maps primary language subtags. + * + * Usage: node scripts/generate-language-tags.ts + * + * Input: + * vendor.local/languagetags.html — OpenType language system tag registry + * downloaded from https://learn.microsoft.com/en-us/typography/opentype/spec/languagetags + * vendor.local/language-subtag-registry.txt — IANA BCP 47 subtag registry + * downloaded from https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry + * vendor.local/iso-639-3.tab — ISO 639-3 code table (for 3-letter to 2-letter mapping) + * downloaded from https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab + * Output: + * src/language-tags.gen.ts — BCP 47 to OpenType language tag mapping. 
+ */ +/* eslint-disable no-console */ +import { readFileSync, writeFileSync } from 'node:fs'; +import { dirname, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const projectRoot = resolve(dirname(fileURLToPath(import.meta.url)), '..'); +const otRegistryPath = resolve(projectRoot, 'vendor.local/languagetags.html'); +const bcp47RegistryPath = resolve(projectRoot, 'vendor.local/language-subtag-registry.txt'); +const iso639Path = resolve(projectRoot, 'vendor.local/iso-639-3.tab'); +const outputPath = resolve(projectRoot, 'src/language-tags.gen.ts'); + +// Manual overrides matching HarfBuzz adjustments +const manualOverrides: Record = { + // Norwegian: HTML maps NOR only to 'nob'. HarfBuzz explicitly adds 'no' -> 'NOR'. + no: 'NOR', + // Chinese: Without overrides, 'zh' maps to all Chinese variants. + // HarfBuzz sets ZHS (Simplified) as the default for bare 'zh'. + zh: 'ZHS', + // Quechua: HarfBuzz removes QUZ from default and re-adds qu -> QUZ. + qu: 'QUZ', + // Malayalam: HarfBuzz increases MLR rank, making MAL (Traditional) preferred. 
/**
 * Extracts OpenType language system entries from the registry HTML page.
 *
 * Every `<tr>` that consists of two or three `<td>` cells is inspected. The
 * first cell is ignored, the second cell is handed to `parseOtTag`, and the
 * optional third cell is handed to `parseIsoCodes`. Rows for which
 * `parseOtTag` returns nothing are skipped.
 *
 * @param html - Raw HTML of the registry page.
 * @returns One entry per accepted row, in document order.
 * @throws Error when fewer than 100 entries are found, which indicates that
 *   the page layout no longer matches the expected table structure.
 */
function parseOtRegistry(html: string): OtEntry[] {
  // Match <tr> blocks containing 2-3 <td> cells.
  const rowPattern =
    /<tr[^>]*>\s*<td[^>]*>([\s\S]*?)<\/td>\s*<td[^>]*>([\s\S]*?)<\/td>\s*(?:<td[^>]*>([\s\S]*?)<\/td>\s*)?<\/tr>/gi;

  const entries: OtEntry[] = [];
  // matchAll works on a private copy of the regex, so no lastIndex state leaks
  // between calls and the match variable is fully typed.
  for (const row of html.matchAll(rowPattern)) {
    const tag = parseOtTag(row[2]);
    if (!tag) continue;

    // Group 3 is undefined for two-cell rows; parseIsoCodes accepts that.
    entries.push({ tag, isoCodes: parseIsoCodes(row[3]) });
  }

  if (entries.length < 100) {
    throw new Error(
      `Expected at least 100 OpenType entries, got ${entries.length}. ` +
        'The HTML structure may have changed.',
    );
  }

  return entries;
}
'AFK ') + text = text.replace(/^'|'$/g, '').trim(); + if (!/^[A-Z]{3,4}$/i.test(text)) return undefined; + return text.toUpperCase(); +} + +function parseIsoCodes(raw: string | undefined): string[] { + if (!raw) return []; + // Take content before
/**
 * Removes HTML tags from a fragment and decodes character references.
 *
 * Tags are removed first, then every character reference is decoded in a
 * single pass so that the output of one replacement is never decoded again
 * (`&amp;lt;` yields the text `&lt;`, not `<`).
 *
 * Supported references:
 * - `&amp;`, `&lt;`, `&gt;` and `&nbsp;` (decoded to a regular space),
 *   matched case-insensitively;
 * - hexadecimal (`&#x41;`) and decimal (`&#65;`) numeric references.
 *
 * Any other named reference, and any numeric reference outside the Unicode
 * range, is dropped.
 *
 * @param s - HTML fragment.
 * @returns Plain text with tags removed and references decoded.
 */
function stripHtml(s: string): string {
  // A Map (not an object literal) so names like "constructor" cannot hit
  // inherited properties.
  const namedEntities = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['nbsp', ' '],
  ]);

  // String.fromCodePoint throws a RangeError for values above U+10FFFF;
  // treat such references like unknown entities instead of aborting the run.
  const decodeCodePoint = (codePoint: number): string =>
    Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff
      ? String.fromCodePoint(codePoint)
      : '';

  return s
    .replace(/<[^>]+>/g, '')
    .replace(
      /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi,
      (_match: string, hex?: string, dec?: string, name?: string): string => {
        if (hex !== undefined) return decodeCodePoint(parseInt(hex, 16));
        if (dec !== undefined) return decodeCodePoint(parseInt(dec, 10));
        return namedEntities.get((name ?? '').toLowerCase()) ?? '';
      },
    );
}
'') + ' ' + line.trim()); + } + continue; + } + const m = line.match(/^([A-Za-z-]+):\s*(.*)$/); + if (m) { + currentKey = m[1]; + fields.set(currentKey, m[2].trim()); + } + } + + const type = fields.get('Type'); + const subtag = fields.get('Subtag'); + const macro = fields.get('Macrolanguage'); + + if (type === 'language' && subtag && macro) { + if (!macrolanguages.has(macro)) { + macrolanguages.set(macro, new Set()); + } + macrolanguages.get(macro)!.add(subtag); + } + } + + return macrolanguages; +} + +// --------------------------------------------------------------------------- +// Build BCP 47 (2-letter) -> OpenType tag mapping +// --------------------------------------------------------------------------- + +function buildMapping( + otEntries: OtEntry[], + iso3to1: Map, + macrolanguages: Map>, +): Map { + // Rank per OT tag: 2 × number of associated ISO codes. + // Lower rank = more specific = preferred. + const otTagRank = new Map(); + // BCP 47 2-letter code -> set of OT tags + const bcp47ToOt = new Map>(); + + for (const entry of otEntries) { + const bcp47Codes = new Set(); + for (const iso of entry.isoCodes) { + const twoLetter = iso3to1.get(iso) ?? (iso.length === 2 ? iso : undefined); + if (twoLetter && twoLetter.length === 2) { + bcp47Codes.add(twoLetter); + } + } + if (bcp47Codes.size === 0) continue; + + // Rank based on the number of associated ISO codes. + // Lower rank = more specific = preferred when multiple OT tags compete. + otTagRank.set(entry.tag, 2 * entry.isoCodes.length); + + for (const code of bcp47Codes) { + if (!bcp47ToOt.has(code)) { + bcp47ToOt.set(code, new Set()); + } + bcp47ToOt.get(code)!.add(entry.tag); + } + } + + // Macrolanguage inheritance: if a 2-letter macrolanguage code has no OT + // mapping, inherit from its member languages. + for (const [macro, members] of macrolanguages) { + const macro2 = iso3to1.get(macro) ?? (macro.length === 2 ? 
/**
 * Renders the TypeScript source of `src/language-tags.gen.ts`.
 *
 * The output consists of a header comment, a `langSysTagMap` object literal
 * with one `code: 'TAG'` entry per mapping, the `LangKey` type, and the
 * exported `languageToOpenTypeTag` lookup function.
 *
 * @param mapping - BCP 47 primary subtag to OpenType language system tag.
 * @returns The complete file contents, terminated by a newline.
 */
function generate(mapping: Map<string, string>): string {
  // Compare by UTF-16 code unit rather than localeCompare: the order of the
  // generated file must not depend on the locale of the machine running the
  // script (some collations sort "aa" or "z…" differently).
  const byCodeUnit = (a: string, b: string): number => (a < b ? -1 : a > b ? 1 : 0);

  const entryLines = [...mapping]
    .sort(([codeA], [codeB]) => byCodeUnit(codeA, codeB))
    .map(([bcp47, otTag]) => `  ${bcp47}: '${otTag}',`);

  const header = [
    '/**',
    ' * Mapping from BCP 47 primary language subtags to OpenType language system tags.',
    ' *',
    ' * Generated from:',
    ' * - OpenType language system tag registry (Microsoft)',
    ' * - IANA BCP 47 subtag registry',
    ' *',
    ' * Source: https://learn.microsoft.com/en-us/typography/opentype/spec/languagetags',
    ' *',
    ' * Do not edit manually. Regenerate with: node scripts/generate-language-tags.ts',
    ' */',
    '',
    'const langSysTagMap = {',
  ];

  const footer = [
    '} as const;',
    '',
    'type LangKey = keyof typeof langSysTagMap;',
    '',
    '/**',
    ' * Map a BCP 47 language tag to an OpenType language system tag.',
    ' * Only the primary language subtag (the part before the first hyphen)',
    ' * is used for the lookup.',
    ' * Returns `undefined` for unmapped languages.',
    ' */',
    'export function languageToOpenTypeTag(language: string): string | undefined {',
    "  const primary = language.split('-')[0].toLowerCase() as LangKey;",
    '  return langSysTagMap[primary];',
    '}',
    // Trailing empty element so the joined text ends with a newline.
    '',
  ];

  return [...header, ...entryLines, ...footer].join('\n');
}
+ * + * The language may influence text shaping in fonts that provide + * language-specific typographic behavior. For example, some fonts + * adjust glyph forms for Turkish or Serbian. + * + * If not set, the parent's language is used, or the document’s + * default language if none of the ancestors specify a language. + */ + language?: string; }; diff --git a/src/language-tags.gen.ts b/src/language-tags.gen.ts new file mode 100644 index 0000000..e1c8bea --- /dev/null +++ b/src/language-tags.gen.ts @@ -0,0 +1,208 @@ +/** + * Mapping from BCP 47 primary language subtags to OpenType language system tags. + * + * Generated from: + * - OpenType language system tag registry (Microsoft) + * - IANA BCP 47 subtag registry + * + * Source: https://learn.microsoft.com/en-us/typography/opentype/spec/languagetags + * + * Do not edit manually. Regenerate with: node scripts/generate-language-tags.ts + */ + +const langSysTagMap = { + aa: 'AFR', + ab: 'ABK', + af: 'AFK', + ak: 'AKA', + am: 'AMH', + an: 'ARG', + ar: 'ARA', + as: 'ASM', + av: 'AVR', + ay: 'AYM', + az: 'AZE', + ba: 'BSH', + be: 'BEL', + bg: 'BGR', + bi: 'BIS', + bm: 'BMB', + bn: 'BEN', + bo: 'TIB', + br: 'BRE', + bs: 'BOS', + ca: 'CAT', + ce: 'CHE', + ch: 'CHA', + co: 'COS', + cr: 'CRE', + cs: 'CSY', + cu: 'CSL', + cv: 'CHU', + cy: 'WEL', + da: 'DAN', + de: 'DEU', + dv: 'DIV', + dz: 'DZN', + ee: 'EWE', + el: 'ELL', + en: 'ENG', + eo: 'NTO', + es: 'ESP', + et: 'ETI', + eu: 'EUQ', + fa: 'FAR', + ff: 'FUL', + fi: 'FIN', + fj: 'FJI', + fo: 'FOS', + fr: 'FRA', + fy: 'FRI', + ga: 'IRI', + gd: 'GAE', + gl: 'GAL', + gn: 'GUA', + gu: 'GUJ', + gv: 'MNX', + ha: 'HAU', + he: 'IWR', + hi: 'HIN', + ho: 'HMO', + hr: 'HRV', + ht: 'HAI', + hu: 'HUN', + hy: 'HYE', + hz: 'HER', + ia: 'INA', + id: 'IND', + ie: 'ILE', + ig: 'IBO', + ii: 'YIM', + ik: 'IPK', + io: 'IDO', + is: 'ISL', + it: 'ITA', + iu: 'INU', + ja: 'JAN', + jv: 'JAV', + ka: 'KAT', + ki: 'KIK', + kj: 'KUA', + kk: 'KAZ', + kl: 'GRN', + km: 'KHM', + kn: 'KAN', + ko: 'KOR', + 
kr: 'KNR', + ks: 'KSH', + ku: 'KUR', + kv: 'KOM', + kw: 'COR', + ky: 'KIR', + la: 'LAT', + lb: 'LTZ', + lg: 'LUG', + li: 'LIM', + ln: 'LIN', + lo: 'LAO', + lt: 'LTH', + lu: 'LUB', + lv: 'LVI', + mg: 'MLG', + mh: 'MAH', + mi: 'MRI', + mk: 'MKD', + ml: 'MAL', + mn: 'MNG', + mr: 'MAR', + ms: 'MLY', + mt: 'MTS', + my: 'BRM', + na: 'NAU', + nb: 'NOR', + nd: 'NDB', + ne: 'NEP', + ng: 'NDG', + nl: 'NLD', + nn: 'NYN', + no: 'NOR', + nr: 'NDB', + nv: 'NAV', + ny: 'CHI', + oc: 'OCI', + oj: 'OJB', + om: 'ORO', + or: 'ORI', + os: 'OSS', + pa: 'PAN', + pi: 'PAL', + pl: 'PLK', + ps: 'PAS', + pt: 'PTG', + qu: 'QUZ', + rm: 'RMS', + rn: 'RUN', + ro: 'ROM', + ru: 'RUS', + rw: 'RUA', + sa: 'SAN', + sc: 'SRD', + sd: 'SND', + se: 'NSM', + sg: 'SGO', + sh: 'BOS', + si: 'SNH', + sk: 'SKY', + sl: 'SLV', + sm: 'SMO', + so: 'SML', + sq: 'SQI', + sr: 'SRB', + ss: 'SWZ', + st: 'SOT', + su: 'SUN', + sv: 'SVE', + sw: 'SWK', + ta: 'TAM', + te: 'TEL', + tg: 'TAJ', + th: 'THA', + ti: 'TGY', + tk: 'TKM', + tl: 'TGL', + tn: 'TNA', + to: 'TGN', + tr: 'TRK', + ts: 'TSG', + tt: 'TAT', + tw: 'TWI', + ty: 'THT', + ug: 'UYG', + uk: 'UKR', + ur: 'URD', + uz: 'UZB', + ve: 'VEN', + vi: 'VIT', + vo: 'VOL', + wa: 'WLN', + wo: 'WLF', + xh: 'XHS', + yi: 'JII', + yo: 'YBA', + za: 'ZHA', + zh: 'ZHS', + zu: 'ZUL', +} as const; + +type LangKey = keyof typeof langSysTagMap; + +/** + * Map a BCP 47 language tag to an OpenType language system tag. + * Only the primary language subtag (the part before the first hyphen) + * is used for the lookup. + * Returns `undefined` for unmapped languages. 
+ */ +export function languageToOpenTypeTag(language: string): string | undefined { + const primary = language.split('-')[0].toLowerCase() as LangKey; + return langSysTagMap[primary]; +} diff --git a/src/language-tags.test.ts b/src/language-tags.test.ts new file mode 100644 index 0000000..5524e80 --- /dev/null +++ b/src/language-tags.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it } from 'vitest'; + +import { languageToOpenTypeTag } from './language-tags.gen.ts'; + +describe('languageToOpenTypeTag', () => { + it('maps English to ENG', () => { + expect(languageToOpenTypeTag('en')).toBe('ENG'); + }); + + it('maps German to DEU', () => { + expect(languageToOpenTypeTag('de')).toBe('DEU'); + }); + + it('maps Arabic to ARA', () => { + expect(languageToOpenTypeTag('ar')).toBe('ARA'); + }); + + it('maps Urdu to URD', () => { + expect(languageToOpenTypeTag('ur')).toBe('URD'); + }); + + it('maps Chinese to ZHS', () => { + expect(languageToOpenTypeTag('zh')).toBe('ZHS'); + }); + + it('maps Norwegian (no) to NOR', () => { + expect(languageToOpenTypeTag('no')).toBe('NOR'); + }); + + it('maps Norwegian Bokmål (nb) to NOR', () => { + expect(languageToOpenTypeTag('nb')).toBe('NOR'); + }); + + it('maps Norwegian Nynorsk (nn) to NYN', () => { + expect(languageToOpenTypeTag('nn')).toBe('NYN'); + }); + + it('maps Quechua to QUZ', () => { + expect(languageToOpenTypeTag('qu')).toBe('QUZ'); + }); + + it('maps Malayalam to MAL', () => { + expect(languageToOpenTypeTag('ml')).toBe('MAL'); + }); + + it('maps Malay to MLY', () => { + expect(languageToOpenTypeTag('ms')).toBe('MLY'); + }); + + it('extracts primary subtag from BCP 47 tag', () => { + expect(languageToOpenTypeTag('en-US')).toBe('ENG'); + expect(languageToOpenTypeTag('zh-Hans')).toBe('ZHS'); + expect(languageToOpenTypeTag('de-AT')).toBe('DEU'); + }); + + it('is case-insensitive', () => { + expect(languageToOpenTypeTag('EN')).toBe('ENG'); + expect(languageToOpenTypeTag('De')).toBe('DEU'); + }); + + it('returns undefined for 
unknown languages', () => { + expect(languageToOpenTypeTag('xx')).toBeUndefined(); + }); +}); diff --git a/src/read/read-block.test.ts b/src/read/read-block.test.ts index d1e414f..6c1958f 100644 --- a/src/read/read-block.test.ts +++ b/src/read/read-block.test.ts @@ -264,6 +264,50 @@ describe('readTextBlock', () => { ), ); }); + + it('includes language property', () => { + const input = { text: 'foo', language: 'de' }; + + const result = readTextBlock(input); + + expect(result.text).toEqual([{ text: 'foo', attrs: { language: 'de' } }]); + }); + + it('inherits language from default attrs', () => { + const input = { text: 'foo' }; + const defaultAttrs = { language: 'de' }; + + const result = readTextBlock(input, defaultAttrs); + + expect(result.text).toEqual([{ text: 'foo', attrs: { language: 'de' } }]); + }); + + it('allows span to override inherited language', () => { + const input = { text: { text: 'foo', language: 'fr' } }; + const defaultAttrs = { language: 'de' }; + + const result = readTextBlock(input, defaultAttrs); + + expect(result.text).toEqual([{ text: 'foo', attrs: { language: 'fr' } }]); + }); + + it('checks language type', () => { + const input = { text: [], language: 23 }; + + expect(() => readTextBlock(input)).toThrow( + new TypeError('Invalid value for "language": Expected string, got: 23'), + ); + }); + + it('rejects invalid language format', () => { + const input = { text: [], language: '123' }; + + expect(() => readTextBlock(input)).toThrow( + new TypeError( + 'Invalid value for "language": Expected string matching pattern /^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{1,8})*$/, got: \'123\'', + ), + ); + }); }); describe('readText', () => { diff --git a/src/read/read-block.ts b/src/read/read-block.ts index c0f9992..778d09b 100644 --- a/src/read/read-block.ts +++ b/src/read/read-block.ts @@ -62,6 +62,7 @@ export type TextAttrs = { fontKerning?: 'normal' | 'none'; fontVariantLigatures?: 'normal' | 'none'; fontFeatureSettings?: Record; + language?: string; }; type 
BlockAttrs = { @@ -201,6 +202,7 @@ export function readTextAttrs(input: Obj): TextAttrs { fontKerning: optional(types.string({ enum: ['normal', 'none'] })), fontVariantLigatures: optional(types.string({ enum: ['normal', 'none'] })), fontFeatureSettings: optional(readFontFeatureSettings), + language: optional(types.string({ pattern: /^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{1,8})*$/ })), }); if (!obj.fontWeight && obj.bold) { obj.fontWeight = 700; diff --git a/src/read/read-document.test.ts b/src/read/read-document.test.ts index d4d1dac..ca12d3c 100644 --- a/src/read/read-document.test.ts +++ b/src/read/read-document.test.ts @@ -150,4 +150,39 @@ describe('readDocumentDefinition', () => { new TypeError('Invalid value for "customData/bar": Expected string or Uint8Array, got: 23'), ); }); + + it('accepts language', () => { + const def = readDocumentDefinition({ ...input, language: 'de' }); + + expect(def.language).toBe('de'); + }); + + it('flows language into defaultStyle', () => { + const content = [{ text: 'foo' }]; + const def = readDocumentDefinition({ ...input, language: 'de', content }); + + expect(def.content).toEqual([{ text: [{ text: 'foo', attrs: { language: 'de' } }] }]); + }); + + it('does not override language in defaultStyle', () => { + const content = [{ text: 'foo' }]; + const defaultStyle = { language: 'fr' }; + const def = readDocumentDefinition({ ...input, language: 'de', defaultStyle, content }); + + expect(def.content).toEqual([{ text: [{ text: 'foo', attrs: { language: 'fr' } }] }]); + }); + + it('checks language type', () => { + expect(() => readDocumentDefinition({ ...input, language: 23 })).toThrow( + new TypeError('Invalid value for "language": Expected string, got: 23'), + ); + }); + + it('rejects invalid language format', () => { + expect(() => readDocumentDefinition({ ...input, language: '123' })).toThrow( + new TypeError( + 'Invalid value for "language": Expected string matching pattern /^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{1,8})*$/, got: \'123\'', + ), + ); + 
}); }); diff --git a/src/read/read-document.ts b/src/read/read-document.ts index 2946514..9464f55 100644 --- a/src/read/read-document.ts +++ b/src/read/read-document.ts @@ -20,6 +20,7 @@ import { parseOrientation, readPageSize } from './read-page-size.ts'; export type DocumentDefinition = { pageSize?: Size; pageOrientation?: 'portrait' | 'landscape'; + language?: string; info?: Metadata; defaultStyle?: TextAttrs; dev?: { guides?: boolean }; @@ -61,6 +62,7 @@ export function readDocumentDefinition(input: unknown): DocumentDefinition { const def1 = readObject(input, { pageSize: optional(readPageSize), pageOrientation: optional(parseOrientation), + language: optional(types.string({ pattern: /^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{1,8})*$/ })), info: optional(readInfo), defaultStyle: optional(readInheritableAttrs), dev: optional(types.object({ guides: optional(types.boolean()) })), @@ -68,6 +70,9 @@ export function readDocumentDefinition(input: unknown): DocumentDefinition { embeddedFiles: optional(types.array(readEmbeddedFiles)), onRenderDocument: optional(), }); + if (def1.language && !def1.defaultStyle?.language) { + def1.defaultStyle = { ...def1.defaultStyle, language: def1.language }; + } const tBlock = (block: unknown) => readBlock(block, def1.defaultStyle); const def2 = readObject(input, { margin: optional(dynamic(parseEdges)), diff --git a/src/render/render-document.test.ts b/src/render/render-document.test.ts index 4091e50..40e8872 100644 --- a/src/render/render-document.test.ts +++ b/src/render/render-document.test.ts @@ -86,6 +86,24 @@ describe('renderDocument', () => { expect(barStreamMatch![1]).toBe('\x01\x02\x03'); }); + it('sets Lang in catalog when language is specified', async () => { + const def = { content: [], language: 'de' }; + + const pdfData = await renderDocument(def, [], noObjectStreams); + const dataString = new TextDecoder().decode(pdfData); + + expect(dataString).toMatch(/\/Lang \(de\)/); + }); + + it('does not set Lang in catalog when language is not 
/**
 * Arranges for the document language to be stored in the PDF catalog.
 *
 * A callback is registered through `unsafeOnRender`; when invoked, it sets
 * the catalog's `Lang` entry to a PDF string holding the given language tag.
 *
 * @param language - Language tag to store (passed through unchanged).
 * @param doc - Document on which the render callback is registered.
 */
function setLanguage(language: string, doc: PDFDocument) {
  doc.unsafeOnRender(({ catalog }) => {
    // The catalog is treated as a plain dictionary, as in the original code.
    (catalog as PDFDict).set('Lang', PDFString.of(language));
  });
}
attrs: { fontSize: 10, language: 'de' } }], + fontStore, + ); + + expect(shapeTextSpy).toHaveBeenCalledWith('foo', { + scriptTag: 'latn', + langSysTag: 'DEU', + }); + }); + it('preserves shaping properties on segments', async () => { const attrs = { fontSize: 10, @@ -284,6 +299,28 @@ describe('text', () => { it('returns undefined when no scriptTag and no features', () => { expect(buildShapeOptions({}, undefined)).toBeUndefined(); }); + + it('includes langSysTag when language is set', () => { + expect(buildShapeOptions({ language: 'de' })).toEqual({ langSysTag: 'DEU' }); + }); + + it('combines langSysTag with scriptTag', () => { + expect(buildShapeOptions({ language: 'de' }, 'latn')).toEqual({ + scriptTag: 'latn', + langSysTag: 'DEU', + }); + }); + + it('combines langSysTag with features', () => { + expect(buildShapeOptions({ language: 'de', fontKerning: 'none' })).toEqual({ + langSysTag: 'DEU', + features: { kern: false }, + }); + }); + + it('ignores unknown language', () => { + expect(buildShapeOptions({ language: 'xx' })).toBeUndefined(); + }); }); describe('splitChunks', () => { diff --git a/src/text.ts b/src/text.ts index f8ef00b..9124976 100644 --- a/src/text.ts +++ b/src/text.ts @@ -2,6 +2,7 @@ import type { PDFFont, ShapedGlyph } from '@ralfstx/pdf-core'; import type { FontStyle, FontWeight } from './api/text.ts'; import type { FontStore } from './font-store.ts'; +import { languageToOpenTypeTag } from './language-tags.gen.ts'; import type { TextAttrs, TextSpan } from './read/read-block.ts'; import type { Color } from './read/read-color.ts'; import { scriptToOpenTypeTag, segmentByScript } from './script-detection.ts'; @@ -288,8 +289,8 @@ function getTextHeight(font: PDFFont, fontSize: number): number { export function buildShapeOptions( attrs: TextAttrs, scriptTag?: string, -): { scriptTag?: string; features?: Record } | undefined { - const { fontKerning, fontVariantLigatures, fontFeatureSettings } = attrs; +): { scriptTag?: string; langSysTag?: string; 
features?: Record } | undefined { + const { fontKerning, fontVariantLigatures, fontFeatureSettings, language } = attrs; const features: Record = { ...fontFeatureSettings }; if (fontVariantLigatures === 'none') { features.liga = false; @@ -299,11 +300,14 @@ export function buildShapeOptions( if (fontKerning === 'none') { features.kern = false; } + const langSysTag = language ? languageToOpenTypeTag(language) : undefined; const hasFeatures = Object.keys(features).length > 0; const hasScriptTag = scriptTag != null && scriptTag !== 'DFLT'; - if (!hasFeatures && !hasScriptTag) return undefined; + const hasLangSysTag = langSysTag != null; + if (!hasFeatures && !hasScriptTag && !hasLangSysTag) return undefined; return { ...(hasScriptTag ? { scriptTag } : undefined), + ...(hasLangSysTag ? { langSysTag } : undefined), ...(hasFeatures ? { features } : undefined), }; }