tootlab-mastodon/config/initializers/twitter_regex.rb

module Twitter::TwitterText
  # Defines (or reopens, if it is already declared elsewhere) the
  # Configuration class inside Twitter::TwitterText.
  class Configuration
    # Always returns false, i.e. emoji parsing is reported as disabled.
    # NOTE(review): presumably this replaces a reader of the same name
    # provided by the twitter-text gem — confirm against the gem version in use.
    def emoji_parsing_enabled
      false
    end
  end

  # Writes a set of URL-related patterns into the REGEXEN table, keyed by
  # symbol. Several entries are built by interpolating other REGEXEN entries,
  # so the order of the assignments below matters.
  # NOTE(review): keys that are read here but never assigned in this file
  # (:valid_url_preceding_chars, :valid_domain, :valid_port_number,
  # :validate_url_unreserved, :validate_url_pct_encoded,
  # :validate_url_sub_delims) are presumably supplied by the twitter-text
  # gem — confirm against the gem version in use.
  class Regex
    # A single URL path character: anything except whitespace, "<", ">",
    # "(", ")" and "?".
    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
    # What a URL path may end with: either one character outside the listed
    # punctuation/whitespace set, or a balanced-parentheses group.
    # NOTE(review): :valid_url_balanced_parens is interpolated here BEFORE it
    # is reassigned on the next statement, so this pattern embeds whatever
    # value that key held when this line ran (presumably the gem's original),
    # not the definition below — confirm this is intended.
    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
    # A parenthesised run of path characters, optionally containing one
    # nested parenthesised run.
    REGEXEN[:valid_url_balanced_parens] = /
      \(
        (?:
          #{REGEXEN[:valid_general_url_path_chars]}+
          |
          # allow one nested level of balanced parentheses
          (?:
            #{REGEXEN[:valid_general_url_path_chars]}*
            \(
              #{REGEXEN[:valid_general_url_path_chars]}+
            \)
            #{REGEXEN[:valid_general_url_path_chars]}*
          )
        )
      \)
    /iox
    # Code point ranges written as character-class source text. The string is
    # single-quoted, so the \u{...} escapes stay literal here and are only
    # interpreted once interpolated into the two query character classes below.
    UCHARS = '\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}'
    # A single character allowed anywhere inside a query string.
    REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@#{UCHARS}]/iou
    # The narrower set of characters a query string is allowed to end with.
    REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-#{UCHARS}]/iou
    # One URL path chunk, in one of two shapes:
    #   * path characters, interleaved with any number of balanced-parentheses
    #     groups, finished by a valid ending character; or
    #   * one or more path characters followed by "/".
    # The pattern is in extended (x) mode, so the literal spaces and newlines
    # between the interpolations are not part of what is matched.
    REGEXEN[:valid_url_path] = /(?:
      (?:
        #{REGEXEN[:valid_general_url_path_chars]}*
        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
        #{REGEXEN[:valid_url_path_ending_chars]}
      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
    )/iox
    # Full URL pattern. The capture groups are labelled inline ($1..$8); the
    # protocol group is optional and accepts http, https, dat, dweb, ipfs,
    # ipns, ssb, gopher and gemini.
    REGEXEN[:valid_url] = %r{
      (                                                                                     #   $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
        (                                                                                   #   $3 URL
          ((?:https?|dat|dweb|ipfs|ipns|ssb|gopher|gemini):\/\/)?                           #   $4 Protocol (optional)
          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
          (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
        )
      )
    }iox
    # One character of the node part of an XMPP URI (used before "@" below):
    # an unreserved character, a percent-encoded sequence, or one of
    # ! $ ( ) * + , ; =
    REGEXEN[:validate_nodeid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      [!$()*+,;=]
    )/iox
    # One character of the resource part of an XMPP URI (used after "/"
    # below): an unreserved character, a percent-encoded sequence, or a
    # sub-delimiter.
    REGEXEN[:validate_resid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      #{REGEXEN[:validate_url_sub_delims]}
    )/iox
    # Pattern for the two extra URI schemes handled outside :valid_url:
    #   * "xmpp:" with an optional "//node@domain/" authority, an optional
    #     "node@" prefix, a domain, an optional "/resource" and an optional
    #     query string; or
    #   * "magnet:" followed by a mandatory query string.
    # As in :valid_url, group 1 is the total match, group 2 the preceding
    # character and group 3 the URI itself.
    REGEXEN[:valid_extended_uri] = %r{
      (                                                                                 #   $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                        #   $2 Preceding character
        (                                                                               #   $3 URL
          (
            (xmpp:)                                                                           # Protocol
            (//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)?                     # Authority (optional)
            (#{REGEXEN[:validate_nodeid]}+@)?                                                 # Username in path (optional)
            (#{REGEXEN[:valid_domain]})                                                       # Domain in path
            (/#{REGEXEN[:validate_resid]}+)?                                                  # Resource in path (optional)
            (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String
          ) | (
            (magnet:)                                                                         # Protocol
            (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})  # Query String
          )
        )
      )
    }iox
  end

  module Extractor
    # Scans the Toot <tt>text</tt> for XMPP and magnet URIs and returns one
    # hash per URI found, in order of appearance. Each hash has the matched
    # URI under <tt>:url</tt> and a two-element <tt>:indices</tt> array with
    # its start and end offsets, as reported by <tt>char_begin</tt> and
    # <tt>char_end</tt> on the match data.
    #
    # An empty array is returned when <tt>text</tt> is <tt>nil</tt>, contains
    # no ":" at all, or contains no XMPP or magnet URI.
    #
    # If a block is given, it is called once per extracted URI (XMPP or
    # magnet alike) with the URI, its start offset and its end offset. The
    # array is returned in either case.
    def extract_extra_uris_with_indices(text, _options = {}) # :yields: uri, start, end
      # Cheap pre-check: both supported schemes contain a ":".
      return [] unless text && text.index(":")

      extended_uri = Twitter::TwitterText::Regex[:valid_extended_uri]
      found = []

      text.to_s.scan(extended_uri) do
        match = Regexp.last_match

        # Capture group 3 is the URI itself, without the preceding character.
        found << {
          url: match[3],
          indices: [match.char_begin(3), match.char_end(3)],
        }
      end

      if block_given?
        found.each do |entry|
          first_index, last_index = entry[:indices]
          yield entry[:url], first_index, last_index
        end
      end

      found
    end
  end
end
Update twitter-text from 1.14 to 3.1.0 and fix toot character counting (#15382) * Update twitter-text from 1.14 to 3.1.0 * Disable emoji parsing * Properly depend on twitter-text for url detection * Fix some URLs being wrongly detected client-side * Add test for server-side validation of non-autolinkable URLs * Fix server-side status length counting 2021-03-02 06:02:56 -05:00			`module Twitter::TwitterText`
			`class Configuration`
			`def emoji_parsing_enabled`
			`false`
			`end`
			`end`

Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 12:03:20 -04:00			`class Regex`
Fix URL linkifier grabbing full-width spaces and quotations (#9997) Fix #9993 Fix #5654 2019-02-09 14:13:11 -05:00			`REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou`
			`REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\\|@]\|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 12:03:20 -04:00			`REGEXEN[:valid_url_balanced_parens] = /`
			`\(`
			`(?:`
			`#{REGEXEN[:valid_general_url_path_chars]}+`
			`\|`
			`# allow one nested level of balanced parentheses`
			`(?:`
			`#{REGEXEN[:valid_general_url_path_chars]}*`
			`\(`
			`#{REGEXEN[:valid_general_url_path_chars]}+`
			`\)`
			`#{REGEXEN[:valid_general_url_path_chars]}*`
			`)`
			`)`
			`\)`
			`/iox`
Minor memory optimizations (#16507) Reduce constant memory usage by ~100kB and further reduce boot-up memory allocations and temporary memory use by a further ~200kB. 2021-10-14 15:04:57 -04:00			`UCHARS = '\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}'`
			`REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~\|@#{UCHARS}]/iou`
			`REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-#{UCHARS}]/iou`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 12:03:20 -04:00			`REGEXEN[:valid_url_path] = /(?:`
			`(?:`
			`#{REGEXEN[:valid_general_url_path_chars]}*`
			`(?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]})`
			`#{REGEXEN[:valid_url_path_ending_chars]}`
			`)\|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)`
			`)/iox`
			`REGEXEN[:valid_url] = %r{`
			`( # $1 total match`
Misc. typos (#8694) Found via `codespell -q 3 --skip="./app/javascript/mastodon/locales,./config/locales"` 2018-09-13 18:53:09 -04:00			`(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 12:03:20 -04:00			`( # $3 URL`
Add support for Gemini urls (#15013) This PR updates the `valid_url` regex and sanitizer allowlist to provide support for Gemini urls. Closes #14991 2020-10-19 11:02:13 -04:00			`((?:https?\|dat\|dweb\|ipfs\|ipns\|ssb\|gopher\|gemini):\/\/)? # $4 Protocol (optional)`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 12:03:20 -04:00			`(#{REGEXEN[:valid_domain]}) # $5 Domain(s)`
			`(?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)`
			`(/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor`
			`(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String`
			`)`
			`)`
			`}iox`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-10 20:15:25 -05:00			`REGEXEN[:validate_nodeid] = /(?:`
			`#{REGEXEN[:validate_url_unreserved]}\|`
			`#{REGEXEN[:validate_url_pct_encoded]}\|`
			`[!$()*+,;=]`
			`)/iox`
			`REGEXEN[:validate_resid] = /(?:`
			`#{REGEXEN[:validate_url_unreserved]}\|`
			`#{REGEXEN[:validate_url_pct_encoded]}\|`
			`#{REGEXEN[:validate_url_sub_delims]}`
			`)/iox`
Add support for magnet: URIs (#12905) 2020-01-23 15:27:26 -05:00			`REGEXEN[:valid_extended_uri] = %r{`
			`( # $1 total match`
			`(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character`
			`( # $3 URL`
Minor memory optimizations (#16507) Reduce constant memory usage by ~100kB and further reduce boot-up memory allocations and temporary memory use by a further ~200kB. 2021-10-14 15:04:57 -04:00			`(`
			`(xmpp:) # Protocol`
			`(//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)? # Authority (optional)`
			`(#{REGEXEN[:validate_nodeid]}+@)? # Username in path (optional)`
			`(#{REGEXEN[:valid_domain]}) # Domain in path`
			`(/#{REGEXEN[:validate_resid]}+)? # Resource in path (optional)`
			`(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String`
			`) \| (`
			`(magnet:) # Protocol`
			`(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]}) # Query String`
			`)`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-10 20:15:25 -05:00			`)`
			`)`
			`}iox`
			`end`

			`module Extractor`
Add support for magnet: URIs (#12905) 2020-01-23 15:27:26 -05:00			`# Extracts a list of all XMPP and magnet URIs included in the Toot <tt>text</tt> along`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-10 20:15:25 -05:00			`# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no`
Add support for magnet: URIs (#12905) 2020-01-23 15:27:26 -05:00			`# XMPP or magnet URIs an empty array will be returned.`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-10 20:15:25 -05:00			`#`
			`# If a block is given then it will be called for each XMPP URI.`
Fixed code quality issues (#15541) * Added .deepsource.toml * Removed bad use of `alias` * Fixed operand order in the binary expression * Prefixed unused method arguments with an underscore * Replaced the old OpenSSL algorithmic constants with the newer strings initializers. * Removed unnecessary UTF-8 encoding comment 2021-01-31 15:26:09 -05:00			`def extract_extra_uris_with_indices(text, _options = {}) # :yields: uri, start, end`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-10 20:15:25 -05:00			`return [] unless text && text.index(":")`
			`urls = []`

Update twitter-text from 1.14 to 3.1.0 and fix toot character counting (#15382) * Update twitter-text from 1.14 to 3.1.0 * Disable emoji parsing * Properly depend on twitter-text for url detection * Fix some URLs being wrongly detected client-side * Add test for server-side validation of non-autolinkable URLs * Fix server-side status length counting 2021-03-02 06:02:56 -05:00			`text.to_s.scan(Twitter::TwitterText::Regex[:valid_extended_uri]) do`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-10 20:15:25 -05:00			`valid_uri_match_data = $~`

			`start_position = valid_uri_match_data.char_begin(3)`
			`end_position = valid_uri_match_data.char_end(3)`

			`urls << {`
			`:url => valid_uri_match_data[3],`
			`:indices => [start_position, end_position]`
			`}`
			`end`
			`urls.each{\|url\| yield url[:url], url[:indices].first, url[:indices].last} if block_given?`
			`urls`
			`end`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 12:03:20 -04:00			`end`
			`end`