264 lines
6.7 KiB
Raw Normal View History

2024-07-07 18:49:38 -07:00
'use strict'
var alphanumeric = require('is-alphanumerical')
var alphabetical = require('is-alphabetical')
var decimal = require('is-decimal')
var regular = require('./regular.json')
var normal = require('./normalize.json')
// Public API: the module's export is the `parse` function itself.
module.exports = parse
// Cached `hasOwnProperty`, invoked as `own.call(object, key)` so lookups only
// match an object's own keys.
var own = {}.hasOwnProperty
/**
 * Parse a BCP 47 language tag.
 *
 * The tag is scanned left to right by character code: language, extended
 * language subtags, script, region, variants, extensions and finally the
 * private-use area.  Matching is done on a lower-cased copy; the values stored
 * in the result are sliced from the original input, so casing is preserved.
 *
 * NOTE(review): the offset/code arguments of the `fail` calls for codes 1-5
 * follow the shape of the code-6 call (`fail(index, 6, ...)`); verify the
 * numeric codes against the package's documented warning list.
 *
 * @param {string} tag
 *   Tag to parse; coerced with `String(tag)`.
 * @param {Object} [options]
 *   Configuration.
 * @param {boolean} [options.normalize=true]
 *   Whether a tag found in the normalization table is replaced by its mapped
 *   value (when it has one) and parsed again.
 * @param {boolean} [options.forgiving=false]
 *   On a syntax problem, return what was parsed so far instead of an empty
 *   result.
 * @param {Function} [options.warning]
 *   Called as `warning(reason, code, offset)` when a syntax problem is found.
 * @returns {Object}
 *   Result with the fields `language`, `extendedLanguageSubtags`, `script`,
 *   `region`, `variants`, `extensions`, `privateuse`, `irregular`, `regular`.
 * @throws {Error}
 *   When `tag` is `null` or `undefined`.
 */
/* eslint-disable-next-line complexity */
function parse(tag, options) {
  var settings = options || {}
  var result = empty()
  var source = String(tag)
  var value = source.toLowerCase()
  var index = 0
  var start
  var groups
  var offset

  // Check input.
  if (tag == null) {
    throw new Error('Expected string, got `' + tag + '`')
  }

  // Let's start.
  // First: the edge cases (tags listed in the normalization table).
  if (own.call(normal, value)) {
    if ((settings.normalize == null || settings.normalize) && normal[value]) {
      return parse(normal[value])
    }

    result[regular.indexOf(value) === -1 ? 'irregular' : 'regular'] = source

    return result
  }

  // Now, to actually parse, eat what could be a language.
  while (alphabetical(value.charCodeAt(index)) && index < 9) index++

  // A language.
  if (index > 1 /* Min 639. */ && index < 9 /* Max subtag. */) {
    // 5 and up is a subtag.
    // 4 is the size of reserved languages.
    // 3 an ISO 639-2 or ISO 639-3.
    // 2 is an ISO 639-1.
    // <https://github.com/wooorm/iso-639-2>
    // <https://github.com/wooorm/iso-639-3>
    result.language = source.slice(0, index)

    if (index < 4 /* Max 639. */) {
      groups = 0

      // A dash followed by exactly three letters.
      while (
        value.charCodeAt(index) === 45 /* `-` */ &&
        alphabetical(value.charCodeAt(index + 1)) &&
        alphabetical(value.charCodeAt(index + 2)) &&
        alphabetical(value.charCodeAt(index + 3)) &&
        !alphabetical(value.charCodeAt(index + 4))
      ) {
        if (groups > 2 /* Max extended language subtag count. */) {
          return fail(
            index,
            3,
            'Too many extended language subtags, expected at most 3 subtags'
          )
        }

        // Extended language subtag.
        result.extendedLanguageSubtags.push(source.slice(index + 1, index + 4))
        index += 4
        groups++
      }
    }

    // ISO 15924 script: a dash followed by exactly four letters.
    // <https://github.com/wooorm/iso-15924>
    if (
      value.charCodeAt(index) === 45 /* `-` */ &&
      alphabetical(value.charCodeAt(index + 1)) &&
      alphabetical(value.charCodeAt(index + 2)) &&
      alphabetical(value.charCodeAt(index + 3)) &&
      alphabetical(value.charCodeAt(index + 4)) &&
      !alphabetical(value.charCodeAt(index + 5))
    ) {
      result.script = source.slice(index + 1, index + 5)
      index += 5
    }

    if (value.charCodeAt(index) === 45 /* `-` */) {
      // ISO 3166-1 region: exactly two letters.
      // <https://github.com/wooorm/iso-3166>
      if (
        alphabetical(value.charCodeAt(index + 1)) &&
        alphabetical(value.charCodeAt(index + 2)) &&
        !alphabetical(value.charCodeAt(index + 3))
      ) {
        result.region = source.slice(index + 1, index + 3)
        index += 3
      }
      // UN M49 region: exactly three digits.
      // <https://github.com/wooorm/un-m49>
      else if (
        decimal(value.charCodeAt(index + 1)) &&
        decimal(value.charCodeAt(index + 2)) &&
        decimal(value.charCodeAt(index + 3)) &&
        !decimal(value.charCodeAt(index + 4))
      ) {
        result.region = source.slice(index + 1, index + 4)
        index += 4
      }
    }

    // Variants.
    while (value.charCodeAt(index) === 45 /* `-` */) {
      offset = start = index + 1

      while (alphanumeric(value.charCodeAt(offset))) {
        if (offset - start > 7 /* Max variant. */) {
          return fail(
            offset,
            1,
            'Too long variant, expected at most 8 characters'
          )
        }

        offset++
      }

      if (
        // Long variant.
        offset - start > 4 /* Min alpha numeric variant. */ ||
        // Short variant: four characters, the first of which is a digit.
        (offset - start > 3 /* Min variant. */ &&
          decimal(value.charCodeAt(start)))
      ) {
        result.variants.push(source.slice(start, offset))
        index = offset
      }
      // Something else.
      else {
        break
      }
    }

    // Extensions.
    while (value.charCodeAt(index) === 45 /* `-` */) {
      // Exit if this isn't an extension: the singleton must be a single
      // alphanumerical other than `x`, followed by a dash and more content.
      if (
        value.charCodeAt(index + 1) === 120 /* `x` */ ||
        !alphanumeric(value.charCodeAt(index + 1)) ||
        value.charCodeAt(index + 2) !== 45 /* `-` */ ||
        !alphanumeric(value.charCodeAt(index + 3))
      ) {
        break
      }

      offset = index + 2
      groups = 0

      // Each subtag of the extension needs at least two characters.
      while (
        value.charCodeAt(offset) === 45 /* `-` */ &&
        alphanumeric(value.charCodeAt(offset + 1)) &&
        alphanumeric(value.charCodeAt(offset + 2))
      ) {
        start = offset + 1
        offset = start + 2
        groups++

        while (alphanumeric(value.charCodeAt(offset))) {
          if (offset - start > 7 /* Max extension. */) {
            return fail(
              offset,
              2,
              'Too long extension, expected at most 8 characters'
            )
          }

          offset++
        }
      }

      if (!groups) {
        return fail(
          offset,
          4,
          'Empty extension, extensions must have at least 2 characters of content'
        )
      }

      result.extensions.push({
        singleton: source.charAt(index + 1),
        extensions: source.slice(index + 3, offset).split('-')
      })

      index = offset
    }
  }
  // Not a language.
  else {
    index = 0
  }

  // Private use: either the whole tag starts with `x`, or `-x` follows what
  // was parsed so far.
  if (
    (index === 0 && value.charCodeAt(index) === 120) /* `x` */ ||
    (value.charCodeAt(index) === 45 /* `-` */ &&
      value.charCodeAt(index + 1) === 120) /* `x` */
  ) {
    // Move past the `x` (and the dash before it, when there is one).
    offset = index = index ? index + 2 : 1

    while (
      value.charCodeAt(offset) === 45 /* `-` */ &&
      alphanumeric(value.charCodeAt(offset + 1))
    ) {
      offset = start = index + 1

      while (alphanumeric(value.charCodeAt(offset))) {
        if (offset - start > 7 /* Max private use. */) {
          return fail(
            offset,
            5,
            'Too long private-use area, expected at most 8 characters'
          )
        }

        offset++
      }

      result.privateuse.push(source.slice(index + 1, offset))
      index = offset
    }
  }

  if (index !== source.length) {
    return fail(index, 6, 'Found superfluous content after tag')
  }

  return result

  // Report a syntax problem through `settings.warning` (when given) and pick
  // the value to return: the partial result when forgiving, otherwise an
  // empty one.
  function fail(offset, code, reason) {
    if (settings.warning) settings.warning(reason, code, offset)
    return settings.forgiving ? result : empty()
  }
}
/**
 * Create an empty results object.
 *
 * Every call builds a new object with new arrays, so one result can be
 * mutated without affecting another.
 *
 * @returns {Object}
 *   Result with the scalar fields set to `null` and the list fields set to
 *   empty arrays.
 */
function empty() {
  return {
    language: null,
    extendedLanguageSubtags: [],
    script: null,
    region: null,
    variants: [],
    extensions: [],
    privateuse: [],
    irregular: null,
    regular: null
  }
}