File: C:/Ruby27-x64/lib/ruby/gems/2.7.0/gems/net-imap-0.3.4/rakelib/string_prep_tables_generator.rb
# frozen_string_literal: true
# Generator for stringprep regexps.
#
# Combines Unicode character classes with generated tables. Generated regexps
# are still used to test that the written regexps conform to the specification.
# Some tables don't match up well with any character properties available to
# ruby's regexp engine. Those use the table-generated regexps.
class StringPrepTablesGenerator
STRINGPREP_RFC_FILE = "rfcs/rfc3454.txt"
STRINGPREP_JSON_FILE = "rfcs/rfc3454-stringprep_tables.json"
# valid UTF-8 can't contain these codepoints
# checking for them anyway, using /\p{Cs}/ ;)
SURROGATES_RANGE = 0xD800..0xDFFF
attr_reader :json_filename, :rfc_filename
def initialize(rfc_filename: STRINGPREP_RFC_FILE,
json_filename: STRINGPREP_JSON_FILE)
@rfc_filename = rfc_filename
@json_filename = json_filename
end
# for rake deps
def json_deps; Rake::FileList.new __FILE__, STRINGPREP_RFC_FILE end
def rb_deps; Rake::FileList.new __FILE__, STRINGPREP_JSON_FILE end
def clean_deps; Rake::FileList.new STRINGPREP_JSON_FILE end
def generate_json_data_file
require "json"
rfc_filename
.then(&File.method(:read))
.then(&method(:parse_rfc_text))
.then(&JSON.method(:pretty_generate))
.then {|data| File.write json_filename, data }
end
def tables; @tables ||= load_tables_and_titles_from_json!.first end
def titles; @titles ||= load_tables_and_titles_from_json!.last end
def ranges; @ranges ||= tables.transform_values(&method(:to_ranges)) end
def arrays; @arrays ||= ranges.transform_values{|t| t.flat_map(&:to_a) } end
def sets; @sets ||= arrays.transform_values(&:to_set) end
def regexps; @regexps ||= arrays.transform_values(&method(:to_regexp)) end
def asgn_regexps; @asgn_regexps || asgn_regexps! end
def merged_tables_regex(*table_names, negate: false)
table_names
.flat_map(&arrays.method(:fetch))
.then {|array| to_regexp(array, negate: negate) }
end
def regexp_for(*names, negate: false)
asgn_regexps[[*names, negate]] ||= merged_tables_regex(*names, negate: negate)
end
def stringprep_rb
<<~RUBY
# frozen_string_literal: true
#--
# This file is generated from RFC3454, by rake. Don't edit directly.
#++
module Net::IMAP::SASL
module StringPrep
#{asgn_table "A.1"}
#{asgn_table "B.1"}
#{asgn_table "B.2"}
#{asgn_table "B.3"}
#{asgn_table "C.1.1"}
#{asgn_table "C.1.2"}
#{asgn_table "C.2.1"}
#{asgn_table "C.2.2"}
#{asgn_table "C.3"}
#{asgn_table "C.4"}
#{asgn_table "C.5"}
#{asgn_table "C.6"}
#{asgn_table "C.7"}
#{asgn_table "C.8"}
#{asgn_table "C.9"}
#{asgn_table "D.1"}
# Used to check req3 of bidirectional checks
#{asgn_table "D.1", negate: true}
#{asgn_table "D.2"}
BIDI_DESC_REQ2 = "A string with RandALCat characters must not contain LCat characters."
# Bidirectional Characters [StringPrep, §6], Requirement 2::
# If a string contains any RandALCat character, the string MUST NOT
# contain any LCat character.
BIDI_FAILS_REQ2 = #{bidi_fails_req2.inspect}.freeze
BIDI_DESC_REQ3 = "A string with RandALCat characters must start and end with RandALCat characters."
# Bidirectional Characters [StringPrep, §6], Requirement 3::
# If a string contains any RandALCat character, a RandALCat
# character MUST be the first character of the string, and a
# RandALCat character MUST be the last character of the string.
BIDI_FAILS_REQ3 = #{bidi_fails_req3.inspect}.freeze
# Bidirectional Characters [StringPrep, §6]
BIDI_FAILURE = #{bidi_failure_regexp.inspect}.freeze
# Names of each codepoint table in the RFC-3454 appendices
TABLE_TITLES = {
#{table_titles_rb}
}.freeze
# Regexps matching each codepoint table in the RFC-3454 appendices
TABLE_REGEXPS = {
#{table_regexps_rb}
}.freeze
end
end
RUBY
end
def table_titles_rb(indent = 3)
titles
.map{|t| "%p => %p," % t }
.join("\n#{" "*indent}")
end
def table_regexps_rb(indent = 3)
asgn_regexps # => { ["A.1", false] => regexp, ... }
.reject {|(_, n), _| n }
.map {|(t, _), _| "%p => %s," % [t, regexp_const_name(t)] }
.join("\n#{" "*indent}")
end
def saslprep_rb
<<~RUBY
# frozen_string_literal: true
#--
# This file is generated from RFC3454, by rake. Don't edit directly.
#++
module Net::IMAP::SASL
module SASLprep
# RFC4013 §2.1 Mapping - mapped to space
# * non-ASCII space characters (\\StringPrep\\[\\"C.1.2\\"]) that can be
# mapped to SPACE (U+0020), and
#
# Equal to \\StringPrep\\[\\"C.1.2\\"].
# Redefined here to avoid loading the StringPrep module.
MAP_TO_SPACE = #{regex_str "C.1.2"}
# RFC4013 §2.1 Mapping - mapped to nothing
# the "commonly mapped to nothing" characters (\\StringPrep\\[\\"B.1\\"])
# that can be mapped to nothing.
#
# Equal to \\StringPrep\\[\\"B.1\\"].
# Redefined here to avoid loading the StringPrep module.
MAP_TO_NOTHING = #{regex_str "B.1"}
# RFC4013 §2.3 Prohibited Output::
# * Non-ASCII space characters — \\StringPrep\\[\\"C.1.2\\"]
# * ASCII control characters — \\StringPrep\\[\\"C.2.1\\"]
# * Non-ASCII control characters — \\StringPrep\\[\\"C.2.2\\"]
# * Private Use characters — \\StringPrep\\[\\"C.3\\"]
# * Non-character code points — \\StringPrep\\[\\"C.4\\"]
# * Surrogate code points — \\StringPrep\\[\\"C.5\\"]
# * Inappropriate for plain text characters — \\StringPrep\\[\\"C.6\\"]
# * Inappropriate for canonical representation characters — \\StringPrep\\[\\"C.7\\"]
# * Change display properties or deprecated characters — \\StringPrep\\[\\"C.8\\"]
# * Tagging characters — \\StringPrep\\[\\"C.9\\"]
TABLES_PROHIBITED = #{SASL_TABLES_PROHIBITED.inspect}.freeze
# Adds unassigned (by Unicode 3.2) codepoints to TABLES_PROHIBITED.
#
# RFC4013 §2.5 Unassigned Code Points::
# This profile specifies the \\StringPrep\\[\\"A.1\\"] table as its list of
# unassigned code points.
TABLES_PROHIBITED_STORED = ["A.1", *TABLES_PROHIBITED].freeze
# Matches codepoints prohibited by RFC4013 §2.3.
#
# See TABLES_PROHIBITED.
#
# Equal to +Regexp.union+ of the TABLES_PROHIBITED tables. Redefined
# here to avoid loading the StringPrep module unless necessary.
PROHIBITED_OUTPUT = #{regex_str(*SASL_TABLES_PROHIBITED)}
# RFC4013 §2.5 Unassigned Code Points::
# This profile specifies the \\StringPrep\\[\\"A.1\\"] table as its list of
# unassigned code points.
UNASSIGNED = #{regex_str "A.1"}
# Matches codepoints prohibited by RFC4013 §2.3 and §2.5.
#
# See TABLES_PROHIBITED_STORED.
PROHIBITED_OUTPUT_STORED = Regexp.union(
UNASSIGNED, PROHIBITED_OUTPUT
).freeze
# Bidirectional Characters [StringPrep, §6]
BIDI_FAILURE = #{bidi_failure_regexp.inspect}.freeze
# Matches strings prohibited by RFC4013 §2.3 and §2.4.
#
# This checks prohibited output and bidirectional characters.
PROHIBITED = Regexp.union(
PROHIBITED_OUTPUT, BIDI_FAILURE,
)
# Matches strings prohibited by RFC4013 §2.3, §2.4, and §2.5.
#
# This checks prohibited output, bidirectional characters, and
# unassigned codepoints.
PROHIBITED_STORED = Regexp.union(
PROHIBITED_OUTPUT_STORED, BIDI_FAILURE,
)
end
end
RUBY
end
private
def parse_rfc_text(rfc3454_text)
titles = {}
tables, = rfc3454_text
.lines
.each_with_object([]) {|line, acc|
current, table = acc.last
case line
when /^([A-D]\.[1-9](?:\.[1-9])?) (.*)/
titles[$1] = $2
when /^ {3}-{5} Start Table (\S*)/
acc << [$1, []]
when /^ {3}-{5} End Table /
acc << [nil, nil]
when /^ {3}([0-9A-F]+); ([ 0-9A-F]*)(?:;[^;]*)$/ # mapping tables
table << [$1, $2.split(/ +/)] if current
when /^ {3}([-0-9A-F]+)(?:;[^;]*)?$/ # regular tables
table << $1 if current
when /^ {3}(.*)/
raise "expected to match %p" % $1 if current
end
}
.to_h.compact
.transform_values {|t| t.first.size == 2 ? t.to_h : t }
tables["titles"] = titles
tables
end
def load_tables_and_titles_from_json!
require "json"
@tables = json_filename
.then(&File.method(:read))
.then(&JSON.method(:parse))
@titles = @tables.delete "titles"
[@tables, @titles]
end
def to_ranges(table)
(table.is_a?(Hash) ? table.keys : table)
.map{|range| range.split(?-).map{|cp| Integer cp, 16} }
.map{|s,e| s..(e || s)}
end
# Starting from a codepoints array (rather than ranges) to deduplicate merged
# tables.
def to_regexp(codepoints, negate: false)
codepoints
.grep_v(SURROGATES_RANGE) # remove surrogate codepoints from C.5 and D.2
.uniq
.sort
.chunk_while {|cp1,cp2| cp1 + 1 == cp2 } # find contiguous chunks
.map {|chunk| chunk.map{|cp| "%04x" % cp } } # convert to hex strings
.partition {|chunk| chunk[1] } # ranges vs singles
.then {|ranges, singles|
singles.flatten!
[
negate ? "^" : "",
singles.flatten.any? ? "\\u{%s}" % singles.join(" ") : "",
ranges.map {|r| "\\u{%s}-\\u{%s}" % [r.first, r.last] }.join,
codepoints.any?(SURROGATES_RANGE) ? "\\p{Cs}" : "", # not necessary :)
].join
}
.then {|char_class| Regexp.new "[#{char_class}]" }
end
def asgn_regexps!
@asgn_regexps = {}
# preset the regexp for each table
asgn_regex "A.1", /\p{^AGE=3.2}/
# If ruby supported all unicode properties (i.e. line break = word joiner):
# /[\u{00ad 034f 1806}\p{join_c}\p{VS}\p{lb=WJ}&&\p{age=3.2}]/
asgn_table "B.1"
asgn_table "B.2"
asgn_table "B.3"
asgn_regex "C.1.1", / /
asgn_regex "C.1.2", /[\u200b\p{Zs}&&[^ ]]/
asgn_regex "C.2.1", /[\x00-\x1f\x7f]/
# C.2.2 is a union:
# Cc + Cf (as defined by Unicode 3.2) + Zl + Zp + 0xfffc
# - any codepoints covered by C.2.1 or C.8 or C.9
#
# But modern Unicode properties are significantly different, so it's better
# to just load the table definition.
asgn_table "C.2.2"
asgn_regex "C.3", /\p{private use}/
asgn_regex "C.4", /\p{noncharacter code point}/
asgn_regex "C.5", /\p{surrogate}/
asgn_regex "C.6", /[\p{in specials}&&\p{AGE=3.2}&&\p{^NChar}]/
asgn_regex "C.7", /[\p{in ideographic description characters}&&\p{AGE=3.2}]/
# C.8 is a union of \p{Bidi Control} and Unicode 3.2 properties. But those properties
# have changed for modern Unicode, and thus for modern ruby's regexp
# character properties. It's better to just load the table definition.
asgn_table "C.8"
asgn_regex "C.9", /[\p{in Tags}&&\p{AGE=3.2}]/
# Unfortunately, ruby doesn't (currently) support /[\p{Bidi
# Class=R}\p{bc=AL}]/. On the other hand, StringPrep (based on Unicode 3.2)
# might not be a good match for the modern (14.0) property value anyway.
asgn_table "D.1"
asgn_table "D.1", negate: true # used by BIDI_FAILS_REQ3
asgn_table "D.2"
@asgn_regexps
end
def regex_str(*names, negate: false)
"%p.freeze" % regexp_for(*names, negate: negate)
end
def asgn_table(name, negate: false)
asgn_regex(name, regexp_for(name, negate: negate), negate: negate)
end
def regexp_const_desc(name, negate: false)
if negate then "Matches the negation of the %s table" % [name]
else %q{%s \\StringPrep\\[\\"%s\\"]} % [titles.fetch(name), name]
end
end
def regexp_const_name(table_name, negate: false)
"IN_%s%s" % [table_name.tr(".", "_"), negate ? "_NEGATED" : ""]
end
def asgn_regex(name, regexp, negate: false)
asgn_regexps[[name, negate]] = regexp
"# %s\n%s%s = %p.freeze" % [
regexp_const_desc(name, negate: negate), " " * 4,
regexp_const_name(name, negate: negate),
regexp,
]
end
def bidi_R_AL ; regexp_for "D.1" end
def bidi_not_R_AL ; regexp_for "D.1", negate: true end
def bidi_L ; regexp_for "D.2" end
def bidi_fails_req2
/ # RandALCat followed by LCat
(?<r_and_al_cat>#{bidi_R_AL.source})
.*?
(?<l_cat>#{bidi_L.source})
| # RandALCat preceded by LCat
\g<l_cat> .*? \g<r_and_al_cat>
/mux
end
def bidi_fails_req3
/ # contains RandALCat but doesn't start with RandALCat
\A(?<not_r_nor_al>#{bidi_not_R_AL})
.*?
(?<r_and_al_cat>#{bidi_R_AL})
| # contains RandALCat but doesn't end with RandALCat
\g<r_and_al_cat> .*? \g<not_r_nor_al>\z
/mux
end
# shares the bidi_R_AL definition between both req2 and req3
def bidi_failure_regexp
req3_with_backref = bidi_fails_req3.source
.gsub(%r{\(\?\<r_and_al_cat\>\(.*?\)\)}, "\g<r_and_al_cat>")
.then{|re|"(?mx-i:#{re})"}
/#{bidi_fails_req2} | #{req3_with_backref}/mux
end
def bidi_consts
<<~RUBY
#############
# Bidirectional checks.
#
RUBY
end
SASL_TABLES_PROHIBITED = %w[
C.1.2 C.2.1 C.2.2 C.3 C.4 C.5 C.6 C.7 C.8 C.9
].freeze
SASL_TABLES_PROHIBITED_STORED = %w[
A.1 C.1.2 C.2.1 C.2.2 C.3 C.4 C.5 C.6 C.7 C.8 C.9
].freeze
end