// Copyright (c) 2012, the Dart project authors.  Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

/// Bidi stands for Bi-directional text. According to
/// http://en.wikipedia.org/wiki/Bi-directional_text: Bi-directional text is
/// text containing text in both text directionalities, both right-to-left (RTL)
/// and left-to-right (LTR). It generally involves text containing different
/// types of alphabets, but may also refer to boustrophedon, which is changing
/// text directionality in each row.
library;

import '../global_state.dart' as global_state;
import 'text_direction.dart';

// Suppress naming issues as changing them would be breaking.
// ignore_for_file: constant_identifier_names
// ignore: avoid_classes_with_only_static_members
/// This provides utility methods for working with bidirectional text. All
/// of the methods are static, and are organized into a class primarily to
/// group them together for documentation and discoverability.
// TODO(alanknight): Consider a better way of organizing these.
class Bidi {
  /// Unicode "Left-To-Right Embedding" (LRE) character.
  static const LRE = '\u202A';

  /// Unicode "Right-To-Left Embedding" (RLE) character.
  static const RLE = '\u202B';

  /// Unicode "Pop Directional Formatting" (PDF) character.
  static const PDF = '\u202C';

  /// Unicode "Left-To-Right Mark" (LRM) character.
  static const LRM = '\u200E';

  /// Unicode "Right-To-Left Mark" (RLM) character.
  static const RLM = '\u200F';

  /// Constant to define the threshold of RTL directionality.
  ///
  /// Text is estimated to be RTL when the share of RTL words among all
  /// strongly directional words is strictly greater than this value.
  static const _RTL_DETECTION_THRESHOLD = 0.40;

  /// Practical patterns to identify strong LTR and RTL characters,
  /// respectively. These patterns are not completely correct according to the
  /// Unicode standard. They are simplified for performance and small code size.
  ///
  /// Both are character-class *contents* (no surrounding brackets) written as
  /// raw strings, so the `\uXXXX` escapes are interpreted by the regular
  /// expression engine rather than by the Dart string parser.
  static const String _LTR_CHARS =
      r'A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590'
      r'\u0800-\u1FFF\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF';
  static const String _RTL_CHARS = r'\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC';

  // The regular expressions below are compiled once (static finals are
  // initialized lazily on first use) instead of on every call.

  /// Simplified pattern for an HTML tag (opening or closing) or HTML escape.
  static final RegExp _htmlSkipRegex = RegExp(r'<[^>]*>|&[^;]+;');

  /// Matches when the first strongly directional character is LTR.
  static final RegExp _startsWithLtrRegex =
      RegExp('^[^$_RTL_CHARS]*[$_LTR_CHARS]');

  /// Matches when the first strongly directional character is RTL.
  static final RegExp _startsWithRtlRegex =
      RegExp('^[^$_LTR_CHARS]*[$_RTL_CHARS]');

  /// Matches when the last strongly directional character is LTR.
  static final RegExp _endsWithLtrRegex =
      RegExp('[$_LTR_CHARS][^$_RTL_CHARS]*\$');

  /// Matches when the last strongly directional character is RTL.
  static final RegExp _endsWithRtlRegex =
      RegExp('[$_RTL_CHARS][^$_LTR_CHARS]*\$');

  /// Matches any strongly LTR character.
  static final RegExp _anyLtrRegex = RegExp('[$_LTR_CHARS]');

  /// Matches any strongly RTL character.
  static final RegExp _anyRtlRegex = RegExp('[$_RTL_CHARS]');

  /// Matches the first `<tagname` of an HTML snippet.
  static final RegExp _openingTagRegex = RegExp(r'<\w+');

  /// Bracketed runs in HTML-escaped text. Angle brackets are matched in their
  /// escaped form (`&lt;` / `&gt;`) so that real tags are never wrapped.
  static final RegExp _htmlBracketRegex =
      RegExp(r'(\(.*?\)+)|(\[.*?\]+)|(\{.*?\}+)|(&lt;.*?(&gt;)+)');

  /// Bracketed runs in plain text, including literal angle brackets.
  static final RegExp _textBracketRegex =
      RegExp(r'(\(.*?\)+)|(\[.*?\]+)|(\{.*?\}+)|(<.*?>+)');

  /// Word separator used when estimating directionality.
  static final RegExp _whitespaceRegex = RegExp(r'\s+');

  /// Tokens that must always be LTR even in RTL text (http and https URLs).
  static final RegExp _urlRegex = RegExp(r'^https?://');

  /// Matches any numeral.
  static final RegExp _digitRegex = RegExp(r'\d');

  /// Returns the input [text] with spaces instead of HTML tags or HTML escapes,
  /// which is helpful for text directionality estimation.
  ///
  /// Note: This function should not be used in other contexts.
  /// It does not deal well with many things: comments, script elements,
  /// style elements, dir attribute, `>` in quoted attribute values,
  /// etc. But it does handle well enough the most common use cases.
  /// Since the worst that can happen as a result of these shortcomings is that
  /// the wrong directionality will be estimated, we have not invested in
  /// improving this.
  static String stripHtmlIfNeeded(String text) {
    // The regular expression is simplified for an HTML tag (opening or
    // closing) or an HTML escape. We might want to skip over such expressions
    // when estimating the text directionality.
    return text.replaceAll(_htmlSkipRegex, ' ');
  }

  /// Determines if the first character in [text] with strong directionality is
  /// LTR. If [isHtml] is true, the text is HTML or HTML-escaped.
  static bool startsWithLtr(String text, [bool isHtml = false]) {
    return _startsWithLtrRegex
        .hasMatch(isHtml ? stripHtmlIfNeeded(text) : text);
  }

  /// Determines if the first character in [text] with strong directionality is
  /// RTL. If [isHtml] is true, the text is HTML or HTML-escaped.
  static bool startsWithRtl(String text, [bool isHtml = false]) {
    return _startsWithRtlRegex
        .hasMatch(isHtml ? stripHtmlIfNeeded(text) : text);
  }

  /// Determines if the exit directionality (i.e., the last
  /// strongly-directional character in [text]) is LTR. If [isHtml] is true,
  /// the text is HTML or HTML-escaped.
  static bool endsWithLtr(String text, [bool isHtml = false]) {
    return _endsWithLtrRegex.hasMatch(isHtml ? stripHtmlIfNeeded(text) : text);
  }

  /// Determines if the exit directionality (i.e., the last
  /// strongly-directional character in [text]) is RTL. If [isHtml] is true,
  /// the text is HTML or HTML-escaped.
  static bool endsWithRtl(String text, [bool isHtml = false]) {
    return _endsWithRtlRegex.hasMatch(isHtml ? stripHtmlIfNeeded(text) : text);
  }

  /// Determines if the given [text] has any LTR characters in it.
  /// If [isHtml] is true, the text is HTML or HTML-escaped.
  static bool hasAnyLtr(String text, [bool isHtml = false]) {
    return _anyLtrRegex.hasMatch(isHtml ? stripHtmlIfNeeded(text) : text);
  }

  /// Determines if the given [text] has any RTL characters in it.
  /// If [isHtml] is true, the text is HTML or HTML-escaped.
  static bool hasAnyRtl(String text, [bool isHtml = false]) {
    return _anyRtlRegex.hasMatch(isHtml ? stripHtmlIfNeeded(text) : text);
  }

  static final _rtlLocaleRegex = RegExp(
      r'^(ar|ckb|dv|he|iw|fa|nqo|ps|sd|ug|ur|yi|.*[-_]'
      r'(Arab|Hebr|Thaa|Nkoo|Tfng))(?!.*[-_](Latn|Cyrl)($|-|_))'
      r'($|-|_)',
      caseSensitive: false);

  // Single-entry cache for [isRtlLanguage]: the last locale checked and the
  // result computed for it.
  static String? _lastLocaleCheckedForRtl;
  static bool? _lastRtlCheck;

  /// Check if a BCP 47 / III [languageString] indicates an RTL language.
  ///
  /// i.e. either:
  /// - a language code explicitly specifying one of the right-to-left scripts,
  ///   e.g. "az-Arab", or
  /// - a language code specifying one of the languages normally written in a
  ///   right-to-left script, e.g. "fa" (Farsi), except ones explicitly
  ///   specifying Latin or Cyrillic script (which are the usual LTR
  ///   alternatives).
  ///
  /// The list of right-to-left scripts appears in the 100-199 range in
  /// http://www.unicode.org/iso15924/iso15924-num.html, of which Arabic and
  /// Hebrew are by far the most widely used. We also recognize Thaana, N'Ko,
  /// and Tifinagh, which also have significant modern usage. The rest (Syriac,
  /// Samaritan, Mandaic, etc.) seem to have extremely limited or no modern
  /// usage and are not recognized.
  /// The languages usually written in a right-to-left script are taken as
  /// those with Suppress-Script: Hebr|Arab|Thaa|Nkoo|Tfng in
  /// http://www.iana.org/assignments/language-subtag-registry,
  /// as well as Sindhi (sd) and Uyghur (ug).
  /// The presence of other subtags of the language code, e.g. regions like EG
  /// (Egypt), is ignored.
  ///
  /// When [languageString] is omitted, the current locale from global state is
  /// checked instead. The most recent answer is cached, so repeated calls with
  /// the same locale do not re-run the regular expression.
  static bool isRtlLanguage([String? languageString]) {
    var language = languageString ?? global_state.getCurrentLocale();
    var cached = _lastRtlCheck;
    if (cached != null && _lastLocaleCheckedForRtl == language) {
      return cached;
    }
    var isRtl = _rtlLocaleRegex.hasMatch(language);
    _lastLocaleCheckedForRtl = language;
    _lastRtlCheck = isRtl;
    return isRtl;
  }

  /// Enforce the [html] snippet in RTL directionality regardless of overall
  /// context. If the html piece was enclosed by a tag, the direction will be
  /// applied to existing tag, otherwise a span tag will be added as wrapper.
  /// For this reason, if the html snippet starts with a tag, this tag must
  /// enclose the whole piece. If the tag already has a direction specified,
  /// this new one will override existing one in behavior (should work on
  /// Chrome, FF, and IE since this was ported directly from the Closure
  /// version).
  static String enforceRtlInHtml(String html) =>
      _enforceInHtmlHelper(html, 'rtl');

  /// Enforce RTL on both end of the given [text] using unicode BiDi formatting
  /// characters RLE and PDF.
  static String enforceRtlInText(String text) => '$RLE$text$PDF';

  /// Enforce the [html] snippet in LTR directionality regardless of overall
  /// context. If the html piece was enclosed by a tag, the direction will be
  /// applied to existing tag, otherwise a span tag will be added as wrapper.
  /// For this reason, if the html snippet starts with a tag, this tag must
  /// enclose the whole piece. If the tag already has a direction specified,
  /// this new one will override existing one in behavior (tested on FF and IE).
  static String enforceLtrInHtml(String html) =>
      _enforceInHtmlHelper(html, 'ltr');

  /// Enforce LTR on both end of the given [text] using unicode BiDi formatting
  /// characters LRE and PDF.
  static String enforceLtrInText(String text) => '$LRE$text$PDF';

  /// Enforce the [html] snippet in the desired [direction] regardless of
  /// overall context. If the html piece was enclosed by a tag, the direction
  /// will be applied to existing tag, otherwise a span tag will be added as
  /// wrapper. For this reason, if the html snippet starts with a tag, this tag
  /// must enclose the whole piece. If the tag already has a direction
  /// specified, this new one will override existing one in behavior (tested on
  /// FF and IE).
  static String _enforceInHtmlHelper(String html, String direction) {
    if (html.startsWith('<')) {
      var match = _openingTagRegex.firstMatch(html);
      // Nothing that looks like `<tagname` (e.g. only a closing tag or a
      // comment): there is no tag to attach the attribute to.
      if (match == null) return html;
      // Insert the attribute right after the tag name so that it precedes,
      // and therefore takes priority over, any existing `dir` attribute.
      var tagEnd = match.end;
      return '${html.substring(0, tagEnd)} dir=$direction'
          '${html.substring(tagEnd)}';
    }
    // '\n' is important for FF so that it won't incorrectly merge span groups.
    return '\n<span dir=$direction>$html</span>';
  }

  /// Apply bracket guard to [str] using html span tag. This is to address the
  /// problem of messy bracket display that frequently happens in RTL layout.
  /// If [isRtlContext] is true, then we explicitly want to wrap in a span of
  /// RTL directionality, regardless of the estimated directionality. If it is
  /// false, LTR is used; if it is omitted, RTL is used only when [str]
  /// contains any RTL character.
  ///
  /// [str] is expected to be HTML-escaped, so angle brackets are recognized in
  /// their `&lt;` / `&gt;` form.
  static String guardBracketInHtml(String str, [bool? isRtlContext]) {
    var useRtl = isRtlContext ?? hasAnyRtl(str);
    return _guardBracketHelper(str, _htmlBracketRegex,
        '<span dir=${useRtl ? "rtl" : "ltr"}>', '</span>');
  }

  /// Apply bracket guard to [str] using LRM and RLM. This is to address the
  /// problem of messy bracket display that frequently happens in RTL layout.
  /// This version works for both plain text and html, but in some cases is not
  /// as good as guardBracketInHtml. If [isRtlContext] is true, then we
  /// explicitly want to surround brackets with RLM marks, regardless of the
  /// estimated directionality. If it is false, LRM is used; if it is omitted,
  /// RLM is used only when [str] contains any RTL character.
  static String guardBracketInText(String str, [bool? isRtlContext]) {
    var useRtl = isRtlContext ?? hasAnyRtl(str);
    var mark = useRtl ? RLM : LRM;
    return _guardBracketHelper(str, _textBracketRegex, mark, mark);
  }

  /// Surrounds every match of [regexp] in [str] with [before] and [after],
  /// like `str.replace(regexp, before + '$&' + after)` in JavaScript.
  ///
  /// The matched text itself is kept. For example,
  /// `_guardBracketHelper('firetruck', RegExp('truck'), 'hydrant', '!')`
  /// returns 'firehydranttruck!'. A null [before] or [after] inserts nothing.
  static String _guardBracketHelper(String str, RegExp regexp,
      [String? before, String? after]) {
    var prefix = before ?? '';
    var suffix = after ?? '';
    return str.replaceAllMapped(regexp,
        (match) => '$prefix${str.substring(match.start, match.end)}$suffix');
  }

  /// Estimates the directionality of [text] using the best known
  /// general-purpose method (using relative word counts). A
  /// TextDirection.UNKNOWN return value indicates completely neutral input.
  /// [isHtml] is true if [text] is HTML or HTML-escaped.
  ///
  /// If the number of RTL words is above a certain percentage of the total
  /// number of strongly directional words, returns RTL.
  /// Otherwise, if any words are strongly or weakly LTR, returns LTR.
  /// Otherwise, returns UNKNOWN, which is used to mean `neutral`.
  /// Numbers and URLs (http or https) are counted as weakly LTR.
  static TextDirection estimateDirectionOfText(String text,
      {bool isHtml = false}) {
    text = isHtml ? stripHtmlIfNeeded(text) : text;
    var rtlCount = 0;
    var total = 0;
    var hasWeaklyLtr = false;
    // Split a string into 'words' for directionality estimation based on
    // relative word counts.
    for (var token in text.split(_whitespaceRegex)) {
      if (startsWithRtl(token)) {
        rtlCount++;
        total++;
      } else if (_urlRegex.hasMatch(token)) {
        // Checked if token looks like something that must always be LTR even in
        // RTL text, such as a URL.
        hasWeaklyLtr = true;
      } else if (hasAnyLtr(token)) {
        total++;
      } else if (_digitRegex.hasMatch(token)) {
        // Checked if token contains any numerals.
        hasWeaklyLtr = true;
      }
    }

    if (total == 0) {
      return hasWeaklyLtr ? TextDirection.LTR : TextDirection.UNKNOWN;
    } else if (rtlCount > _RTL_DETECTION_THRESHOLD * total) {
      return TextDirection.RTL;
    } else {
      return TextDirection.LTR;
    }
  }

  /// Replace the double and single quote directly after a Hebrew character in
  /// [str] with GERSHAYIM and GERESH respectively. This is most likely the
  /// user's intention.
  static String normalizeHebrewQuote(String str) {
    const doubleQuote = 0x22; // "
    const singleQuote = 0x27; // '
    var buf = StringBuffer();
    // Whether the previous character of the *input* is in \u0591-\u05f2. The
    // first character has no predecessor, so it is always copied unchanged.
    var afterHebrew = false;
    for (var i = 0; i < str.length; i++) {
      var unit = str.codeUnitAt(i);
      if (afterHebrew && unit == doubleQuote) {
        buf.write('\u05f4');
      } else if (afterHebrew && unit == singleQuote) {
        buf.write('\u05f3');
      } else {
        buf.write(str[i]);
      }
      afterHebrew = unit >= 0x0591 && unit <= 0x05f2;
    }
    return buf.toString();
  }

  /// Check the estimated directionality of [str], return true if the piece of
  /// text should be laid out in RTL direction. If [isHtml] is true, the string
  /// is HTML or HTML-escaped.
  static bool detectRtlDirectionality(String str, {bool isHtml = false}) =>
      estimateDirectionOfText(str, isHtml: isHtml) == TextDirection.RTL;
}