Enable to recognize most kinds of characters as URL paths (#4941)

2025-03-17 08:50:54 -04:00 · 2017-09-15 01:03:20 +09:00 · 2017-09-15 01:03:20 +09:00 · 3816943e6b
commit 3816943e6b
parent b39d512ade
5 changed files with 96 additions and 5 deletions
--- a/app/lib/formatter.rb
+++ b/app/lib/formatter.rb
@ -131,7 +131,7 @@ class Formatter
  end

  def link_html(url)
-    url    = Addressable::URI.parse(url).display_uri.to_s
+    url    = Addressable::URI.parse(url).to_s
    prefix = url.match(/\Ahttps?:\/\/(www\.)?/).to_s
    text   = url[prefix.length, 30]
    suffix = url[prefix.length + 30..-1]
--- a/app/services/fetch_link_card_service.rb
+++ b/app/services/fetch_link_card_service.rb
@ -1,9 +1,15 @@
 # frozen_string_literal: true

 class FetchLinkCardService < BaseService
-  include ActionView::Helpers::TagHelper
-
-  URL_PATTERN = %r{https?://\S+}
+  URL_PATTERN = %r{
+    (                                                                                                 #   $1 URL
+      (https?:\/\/)?                                                                                  #   $2 Protocol (optional)
+      (#{Twitter::Regex[:valid_domain]})                                                              #   $3 Domain(s)
+      (?::(#{Twitter::Regex[:valid_port_number]}))?                                                   #   $4 Port number (optional)
+      (/#{Twitter::Regex[:valid_url_path]}*)?                                                         #   $5 URL Path and anchor
+      (\?#{Twitter::Regex[:valid_url_query_chars]}*#{Twitter::Regex[:valid_url_query_ending_chars]})? #   $6 Query String
+    )
+  }iox

  def call(status)
    @status = status
@ -42,7 +48,7 @@ class FetchLinkCardService < BaseService

  def parse_urls
    if @status.local?
-      urls = @status.text.match(URL_PATTERN).to_a.map { |uri| Addressable::URI.parse(uri).normalize }
+      urls = @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[0]).normalize }
    else
      html  = Nokogiri::HTML(@status.text)
      links = html.css('a')
--- a/config/initializers/twitter_regex.rb
+++ b/config/initializers/twitter_regex.rb
@ -0,0 +1,42 @@
+module Twitter
+  class Regex
+
+    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou
+    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]\p{Pd}_~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
+    REGEXEN[:valid_url_balanced_parens] = /
+      \(
+        (?:
+          #{REGEXEN[:valid_general_url_path_chars]}+
+          |
+          # allow one nested level of balanced parentheses
+          (?:
+            #{REGEXEN[:valid_general_url_path_chars]}*
+            \(
+              #{REGEXEN[:valid_general_url_path_chars]}+
+            \)
+            #{REGEXEN[:valid_general_url_path_chars]}*
+          )
+        )
+      \)
+    /iox
+    REGEXEN[:valid_url_path] = /(?:
+      (?:
+        #{REGEXEN[:valid_general_url_path_chars]}*
+        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+        #{REGEXEN[:valid_url_path_ending_chars]}
+      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
+    )/iox
+    REGEXEN[:valid_url] = %r{
+      (                                                                                     #   $1 total match
+        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceeding chracter
+        (                                                                                   #   $3 URL
+          (https?:\/\/)?                                                                    #   $4 Protocol (optional)
+          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
+          (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
+          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
+          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
+        )
+      )
+    }iox
+  end
+end
--- a/spec/lib/formatter_spec.rb
+++ b/spec/lib/formatter_spec.rb
@ -89,6 +89,38 @@ RSpec.describe Formatter do
      end
    end

+    context 'matches a URL with Japanese path string' do
+      let(:text) { 'https://ja.wikipedia.org/wiki/日本' }
+
+      it 'has valid URL' do
+        is_expected.to include 'href="https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC"'
+      end
+    end
+
+    context 'matches a URL with Korean path string' do
+      let(:text) { 'https://ko.wikipedia.org/wiki/대한민국' }
+
+      it 'has valid URL' do
+        is_expected.to include 'href="https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD"'
+      end
+    end
+
+    context 'matches a URL with Simplified Chinese path string' do
+      let(:text) { 'https://baike.baidu.com/item/中华人民共和国' }
+
+      it 'has valid URL' do
+        is_expected.to include 'href="https://baike.baidu.com/item/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD"'
+      end
+    end
+
+    context 'matches a URL with Traditional Chinese path string' do
+      let(:text) { 'https://zh.wikipedia.org/wiki/臺灣' }
+
+      it 'has valid URL' do
+        is_expected.to include 'href="https://zh.wikipedia.org/wiki/%E8%87%BA%E7%81%A3"'
+      end
+    end
+
    context 'contains HTML (script tag)' do
      let(:text) { '<script>alert("Hello")</script>' }

--- a/spec/services/fetch_link_card_service_spec.rb
+++ b/spec/services/fetch_link_card_service_spec.rb
@ -12,6 +12,8 @@ RSpec.describe FetchLinkCardService do
    stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt'))
    stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
    stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt'))
+    stub_request(:head, 'http://example.com/日本語').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
+    stub_request(:get, 'http://example.com/日本語').to_return(request_fixture('sjis.txt'))
    stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404)

    subject.call(status)
@ -52,6 +54,15 @@ RSpec.describe FetchLinkCardService do
        expect(status.preview_cards.first.title).to eq("Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.")
      end
    end
+
+    context do
+      let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') }
+
+      it 'works with Japanese path string' do
+        expect(a_request(:get, 'http://example.com/日本語')).to have_been_made.at_least_once
+        expect(status.preview_cards.first.title).to eq("SJISのページ")
+      end
+    end
  end

  context 'in a remote status' do