User:Verdy p/tablesort.js

From Wikipedia, the free encyclopedia
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
/* Please DON'T delete, this is only in MY own user space, and it is NOT a test. See rule: [[Wikipedia:CSD#G2]] */
/* <pre> */
/* =====================================================================
Support for sorting text cells of tables in a much more consistent way.
(c) 2010 Philippe Verdy : [[fr:Utilisateur:Verdy_p]] (http://fr.wikipedia.org/wiki/Utilisateur:Verdy_p).

Available under the following licences: {{LGPL}} {{GFDL}} {{CC-BY-SA}} {{CeCILL}}
(see details about these licence templates on http://commons.mediawiki.org)

This implements a 2-level collation, nearly compatible with UCA (see UTS#10 and CLDR for tailoring), but:
* It is still using a the binary ordering of characters used in subkeys at each level
  (instead of their preferred collation weight).
* Ternary differences are all left in the last default level that uses the binary ordering of characters
  (actually it uses the ordering of UTF-16 code units, instead of code point values of UTF-32).
  This has the effect of sorting "a-a" before "aa", because they have the same primary and secundary
  subkeys, and they differ only on the ternary level.
* It is incomplete in the decomposition mappings of graphemes.
* It is almost completely locale-neutral (not tailored for any different locales), except that it contains
  a very limited support for French reverse ordering of the ternary level.

But despite these defects, it gives much better results than the default binary order of UTF-16 code units
(the only option natively implemented in Javascript which makes it a level-0 collation), or than a
simple 1-level collation with case mapping only (currently used in wikibits.js).

A complete UCA implementation, using true weights from the Unicode DUCET and possible per-locale
tailoring would require much more work (but would produce more compact sort keys). This implementation
is very fast and its code is simple enough and small, that it can be a full replacement of the current
ts_toLowerCase() function used to generate sort keys for text cells.

It does not impact the existing non-text (number, currency, date) ordering types of Mediawiki tables.
======================================================================= */

/* Utility */
function ts_toOtherCase(s) {
   var c, d;
   s = s.split('');
   for (var i = s.length; --i >= 0;) s[i] = (c = s[i]) == (d = c.toLowerCase()) ? c.toUpperCase() : d;
   return s.join('');
}

/* ===================================================================== */

ts_textToKeyReversed = mw.config.get("wgContentLanguage") == 'fr';

/* Grapheme_Cluster_Break property of Unicode:
  (see UAX #29 3.1 Default Grapheme Cluster Boundary Specification)
  This includes Grapheme_Extend = true, i.e.:
  * General_Category = Nonspacing_Mark
  * General_Category = Enclosing_Mark
  * U+200C ZERO WIDTH NON-JOINER (ZWNJ) // TODO: can't be unsupported here
  * U+200D ZERO WIDTH JOINER (ZWJ) // TODO: can't be unsupported here
  * plus a few Spacing Marks needed for canonical equivalence.
  as well as the following prepended characters (that don't follow the logical order):
  * U+0E30 ( ะ ) THAI CHARACTER SARA A
  * U+0E32 ( า ) THAI CHARACTER SARA AA
  * U+0E33 ( ำ ) THAI CHARACTER SARA AM
  * U+0E45 ( ๅ ) THAI CHARACTER LAKKHANGYAO
  * U+0EB0 ( ະ ) LAO VOWEL SIGN A
  * U+0EB2 ( າ ) LAO VOWEL SIGN AA
  * U+0EB3 ( ຳ ) LAO VOWEL SIGN AM
  FIXME: need support for graphemes longer than 2 characters (notably with ZWJ/ZWNJ).
  FIXME: check Unicode 5/6 completeness.
*/
ts_textToKeyGraphemeExtends =
  '\u0300-\u036F' +
  '\u0483-\u0489' +
  '\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7' +
  '\u0610-\u061A\u064B-\u065E\u0670\u06D6-\u06DC\u06DE-\u06E4\u06E7\u06E8\u06EA-\u06ED' +
  '\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3' +
  '\u0901-\u0903\u093C\u093E-\u094D\u0951-\u0954\u0962\u0963' +
  '\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3' +
  '\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75' +
  '\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3' +
  '\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63' +
  '\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7' +
  '\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63' +
  '\u0C82\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3' +
  '\u0D02\u0D03\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63' +
  '\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3' +
  '\u0E30-\u0E3A\u0E45\u0E47-\u0E4E' + /* including '\u0E30\u0E32\u0E33\u0E45' (see notice above) */
  '\u0EB0-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD' + /* including '\u0EB0\u0EB2\u0EB3' (see notice above) */
  '\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F90-\u0F97\u0F99-\u0FBC\u0FC6' +
  '\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F' +
  '\u135F' +
  '\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B6-\u17D3\u17DD' +
  '\u180B-\u180D\u18A9' +
  '\u1920-\u192B\u1930-\u193B\u19B0-\u19C0\u19C8\u19C9' +
  '\u1A17-\u1A1B' +
  '\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAA' +
  '\u1C24-\u1C37' +
  '\u1DC0-\u1DE6\u1DFE\u1DFF' +
  '\u20D0-\u20F0' +
  '\u2DE0-\u2DFF' +
  '\u302A-\u302F\u3099\u309A' +
  '\uA66F-\uA672\uA67C\uA67D' +
  '\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C4' +
  '\uA926-\uA92D\uA947-\uA953' +
  '\uAA29-\uAA36\uAA43\uAA4C\uAA4D' +
  '\uFB1E' +
  '\uFE00-\uFE0F\uFE20-\uFE26';

/*
  The "Prepend" characters (not obeying the logical structure) should be handled.
  (see UAX #29 3.1 Default Grapheme Cluster Boundary Specification).
*/
ts_textToKeyGraphemePrepends =
  '\u0E40-\u0E44\u0EC0-\u0EC4';

/*
  FIXME: check the missing currency symbols or non-separating punctuations to ignore.
  TODO: add the missing format controls to ignore.
*/
ts_textToKeyDropped = new RegExp(
   '[-\'\x7F-\xBF\xAD' +
    '⁃·´`‚‘’~^†‡¶§©®$£€¢₩฿₨₧₪₣₡₢' +
   ts_textToKeyGraphemeExtends +
   ']+', 'g' );

/*
  FIXME: add the missing whitespaces and word-separating punctuation signs.
*/
ts_textToKeyBlanked = new RegExp(
   '[\\s\\"\\[\\]\\\\ \xA0' +
   '{}()|+*/±×÷%<=>°„“”«».,;:¡!‼¿?…•_–—#&' +
   'ƧƨƻƼƽ' + /* tone letters ignored for sort */
   'ʔʡʕʢʖƾ' + /* glottal letters */
   'ǀǁǂǃʘ' + /* click letters */
   ']+', 'g' );

/*
  Note: For performance reason in browsers, the values in this map MUST be precompiled
  regexps, ready for use as parameters of "string.replace(regexp, subst)" in loops.

  TODO: add more mappings for more languages...
  This mapping is nearly complete for modern languages written in Latin.
  Should add mappings for precomposed characters in Greek, Cyrillic...
  Hangul precomposed syllables should be decomposed algorithmically into jamos.
*/
ts_textToKeyRemapped = {
  '0': /[⁰₀]/g, '1': /[¹₁]/g, '2': /[²₂]/g, '3': /[³₃]/g, '4': /[⁴₄]/g, '5': /[⁵₅]/g, '6': /[⁶₆]/g, '7': /[⁷₇]/g, '8': /[⁸₈]/g, '9': /[⁹₉]/g,
  'a': /[áàãâấầẫẩäǟȁảǎăắằẵẳȃāǡåǻẚạậặḁąɐɑɒª@]/g, 'A': /[ÁÀÃÂẤẦẪẨÄǞȀẢǍĂẮẰẴẲȂĀǠÅǺẠẬẶḀĄ]/g,
  'ae': /[æǽǣ]/g, 'AE': /[ÆǼǢ]/g,
  'b': /[ḃḅḇƀƃƅɓʙ]/g, 'B': /[ḂḄḆƁƂƄ]/g,
  'c': /[ćĉčċçḉƈɕʗɔ]/g, 'C': /[ĆĈČĊÇḈƇƆ]/g,
  'd': /[ďḋḍḑḏḓƌɗɖ]/g, 'D': /[ĎḊḌḐḎḒƉƊƋ]/g,
  'dh': /[ðđ]/g, 'DH': /[ÐĐ]/g,
  'dz': /[ʣdzdžʤʥ]/g, 'Dz': /[DzDž]/g, 'DZ': /[DZDŽ]/g,
  'e': /[éèẽêếềễểëȅẻěĕȇḗḕḛḙḝėẹệęɘəɚɛʚɜɝɞ]/g, 'E': /[ÉÈẼÊẾỀỄỂËȄẺĚĔȆḖḔĖẸỆḘḚḜĘƏƎƐ]/g,
  'f': /[ḟƒ]/g, 'F': /[ḞƑ]/g,
  'fi': /[fi]/g,
  'fl': /[fl]/g,
  'g': /[ǵĝǧğḡġģǥɠɡɢʛɣɤ/g, 'G': /[ǴĜǦĞḠĠĢǤƓƔ]/g,
  'gh': /[ƣ]/g, 'GH': /[Ƣ]/g,
  'h': /[ĥḧḣḫḥẖḩħɥɦɧʜ]/g, 'H': /[ĤḦḢḪḤḨĦ]/g,
  'hv': /[ƕ]/g,
  'i': /[íìĩîïḯǐĭȋīȉỉıḭịįɨɩɪ]/g, 'I': /[ÍÌĨÎÏḮǏĬȊĪȈỈİḬỊĮƗƖ]/g,
  'ij': /[ij]/g, 'IJ': /[IJ]/g,
  'j': /[ĵǰɟʝ]/g, 'J': /[Ĵ]/g,
  'k': /[ḱǩḳķḵƙĸʞ]/g, 'K': /[ḰǨḲĶḴƘ]/g,
  'l': /[ĺḻḷḹḽļľŀłƚɫɬɭʟƛ]/g, 'L': /[ĽḺḶḸḼĻĿŁ]/g,
  'lj': /[lj]/g, 'Lj': /[Lj]/g, 'LJ': /[LJ]/g,
  'lz': /[ɮ]/g,
  'm': /[ḿṁṃɯɰɱ]/g, 'M': /[ḾṀṂƜ]/g,
  'n': /[ńñňṅṇņṉṋʼnɲŋɳƞɴ]/g, 'N': /[ŃÑŇṄṆŅṈṊƝŊ]/g,
  'nj': /[nj]/g, 'Nj': /[Nj]/g, 'NJ': /[NJ]/g,
  'o': /[óòõṍṏôốồỗổǒöǒŏȏōṓṑőȍỏọộǫǭơớờỡởợøǿɵɷº]/g, 'O': /[ÓÒÕṌṎÔỐỒỖỔÖǑŎȎŌṒṐŐȌỎỌỘǪǬƠỚỜỠỞỢØǾƟ]/g,
  'oe': /[œɶ]/g, 'OE': /[Œ]/g,
  'p': /[ṕṗƥ]/g, 'P': /[ṔṖƤ]/g,
  'ph': /[ɸ]/g,
  'q': /[ʠ]/g,
  'r': /[ŕřȓṙȑŗṛṝṟɼɽɾɹɺɻɿʅʀʁ]/g, 'R': /[ŔŘȒṘȐŖṚṜṞƦ]/g,
  's': /[śṥŝšṧṡṣṩşʂſẛʃʄʆƪ]/g, 'S': /[ŚṤŜŠṦṠṢṨŞƩ]/g,
  'ss': /[ß]/g,
  't': /[ẗťṫṭṯṱţƫƭŧʈʇ]/g, 'T': /[ŤṪṬṮṰŢƮƬŦ]/g,
  'tc': /[ʨ]/g,
  'ts': /[ʦʧ]/g,
  'th': /[þ]/g, 'TH': /[Þ]/g,
  'u': /[úùũṹûǔüǘǜǖǚŭȗūṻůűȕủṵṳụṷųưứừữửựʉʊ]/g, 'U': /[ÚÙŨṸÛǓÜǗǛǕǙŬȖŪṺŮŰȔỦṴṲỤṶŲƯỨỪỮỬỰƱ]/g,
  'v': /[ṽṿʋʌ]/g, 'V': /[ṼṾƲ]/g,
  'w': /[ẃẁŵẅẇẘẉʍƿ]/g, 'W': /[ẂẀŴẄẆẈ]/g,
  'x': /[ẍẋ]/g, 'X': /[ẌẊ]/g,
  'y': /[ýỳŷỹÿẏỷẙỵƴʎʏ]/g, 'Y': /[ÝỲŶỸŸẎỶỴƳ]/g,
  'z': /[źẑžżẓẕƶʐʑʒǯƺʓƹ]/g, 'Z': /[ŹẐŽŻẒẔƵƷǮƸ]/g,
};

/*
  Utility for reversing the ternary key containing differences such as accents (mostly for French).

  Note that with a full UCA implementation, this should be the secondary level, before case differences.
  However we currently don't convert letters to sortable collation weights, so this would not work as
  expected, and letters with all accent would sort after all letters without accents.
  So this implementation swaps levels 2 and 3, to make sure that the same base letters with the same
  case will be sorted together. Difference of accents are left in the final level, binary-ordered here
  according to their UTF-16 code units, along with other ignorable differences.

  Ideally, this should correctly use the Grapheme Cluster Boundary Rules of UAX#29, i.e. all legacy grapheme clusters
  plus rule GB9b of extended grapheme clusters (but probably not rule GB9a here for string reversal,
  because spacing marks such as '^' or '~' should be treated isolately, just like apostrophes; note that
  these spacing marks are ignored here in the primary and secondary subkeys).

  BUG: cannot handle string reversal so that it will detect and not break all extended grapheme clusters
  that are longer than 2 characters (especially Hangul L+V+T* clusters), or longer than 1 character in
  in supplementary planes. If Hangul is handled later, these clusters should be preserved 

  As this is just used here for French,  this bug should not cause any practical
  problem for sorting actual tables.

  Don't use this function for meaningful display ! A true grapheme cluster breaker, needed for rendering
  and text layout, would have to handle all these cases (including rule GB9a or extended grapheme
  clusters)

  A more precise detection of grapheme clusters for collation would require a huge and complex map of
  collation weights (at least the DUCET).
*/
ts_textToKeyReverseGraphemeExtends = new RegExp(
  '(.)([' + '\uDC00-\uDFFF' + /* include trailing surrogates */
  ts_textToKeyGraphemeExtends + '])', 'g' );
ts_textToKeyReverseGraphemeExtends = new RegExp(
  '([' + ts_textToKeyGraphemePrepends + '])(.)', 'g' );
function ts_textToKeyReverse(s) {
  return s
    .replace(ts_textToKeyReverseGraphemeExtends, '$2$1')
    .replace(ts_textToKeyReverseGraphemePrepends, '$2$1')
    .split('').reverse().join('');
}

function ts_textToKeyFilter(s) {
  var t = s.replace(ts_textToKeyDropped, '').replace(ts_textToKeyBlanked, ' ').replace(/^ /, '').replace(/ $/, '');
  for (var k in ts_textToKeyRemapped) t = t.replace(ts_textToKeyRemapped[k], k);
  /* TODO: Hangul precomposed syllables should be decomposed algorithmically into jamos. */
  return t;
}

/*
  Main function.
*/
function ts_textToKey(s) {
  var f = ts_toOtherCase(ts_textToKeyFilter(s)), k = f.toLowerCase(), x = '\t';
  if (ts_textToKeyReversed) s = ts_textToKeyReverse(s); /* mainly for French */
  return f != s ? k + x + f + x + s : k != f ? k + x + f : k;
}

/*
  Override the existing "wikibits.js" for table sorts, when processing text cells into sort keys.
*/
ts_toLowerCase = ts_textToKey;

/* ===================================================================== */
/* </pre> */