245 lines
		
	
	
		
			6.8 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			245 lines
		
	
	
		
			6.8 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
# frozen_string_literal: true
 | 
						|
 | 
						|
require 'singleton'
 | 
						|
 | 
						|
#  See also `app/javascript/features/account/util/bio_metadata.js`.
 | 
						|
 | 
						|
#  Extracts a small YAML-like frontmatter block (`---` … `---` or `...`)
#  from bio text and turns its `key: value` lines into metadata pairs.
#
#  The grammar is assembled bottom-up from the regular expressions below;
#  every composite expression is built from the `source` of smaller ones
#  (wrapped in non-capturing groups), so the only capturing groups are the
#  ones written explicitly as '(' … ')'.
class FrontmatterHandler
  include Singleton

  #  CONVENIENCE FUNCTIONS  #

  #  Compiles `str` into a Regexp with the MULTILINE flag set (so `.` also
  #  matches line breaks).
  #
  #  Only two arguments are passed to `Regexp.new`: the former third
  #  argument ('u') is rejected by newer Ruby releases, and the encoding
  #  of the resulting expression follows the source string anyway.
  def self.unirex(str)
    Regexp.new(str, Regexp::MULTILINE)
  end

  #  Returns the source of `exp` wrapped in a non-capturing group, ready
  #  to be concatenated into a larger pattern. Flags of `exp` are dropped.
  def self.rexstr(exp)
    "(?:#{exp.source})"
  end

  #  CHARACTER CLASSES  #

  #  NOTE(review): `^` and `$` are line anchors in Ruby, so these match at
  #  any line boundary, not only at the very start/end of the text.
  #  `process_bio` copes with a match that starts on a later line.
  DOCUMENT_START    = /^/
  DOCUMENT_END      = /$/
  #  `c-printable` in the YAML 1.2 spec.
  ALLOWED_CHAR      =
    /[\t\n\r\u{20}-\u{7e}\u{85}\u{a0}-\u{d7ff}\u{e000}-\u{fffd}\u{10000}-\u{10ffff}]/u
  WHITE_SPACE       = /[ \t]/
  INDENTATION       = / */
  #  An HTML `<br>` tag counts as a line break, too.
  LINE_BREAK        = /\r?\n|\r|<br\s*\/?>/
  ESCAPE_CHAR       = /[0abt\tnvfre "\/\\N_LP]/
  HEXADECIMAL_CHARS = /[0-9a-fA-F]/
  INDICATOR         = /[-?:,\[\]{}&#*!|>'"%@`]/
  FLOW_CHAR         = /[,\[\]{}]/

  #  NEGATED CHARACTER CLASSES  #

  NOT_WHITE_SPACE   = unirex '(?!' + rexstr(WHITE_SPACE) + ').'
  NOT_LINE_BREAK    = unirex '(?!' + rexstr(LINE_BREAK) + ').'
  NOT_INDICATOR     = unirex '(?!' + rexstr(INDICATOR) + ').'
  NOT_FLOW_CHAR     = unirex '(?!' + rexstr(FLOW_CHAR) + ').'
  NOT_ALLOWED_CHAR  = unirex '(?!' + rexstr(ALLOWED_CHAR) + ').'

  #  BASIC CONSTRUCTS  #

  ANY_WHITE_SPACE   = unirex rexstr(WHITE_SPACE) + '*'
  ANY_ALLOWED_CHARS = unirex rexstr(ALLOWED_CHAR) + '*'
  NEW_LINE          = unirex(
    rexstr(ANY_WHITE_SPACE) + rexstr(LINE_BREAK)
  )
  SOME_NEW_LINES    = unirex(
    '(?:' + rexstr(ANY_WHITE_SPACE) + rexstr(LINE_BREAK) + ')+'
  )
  #  The frontmatter may be preceded by an opening `<p …>` tag.
  POSSIBLE_STARTS   = unirex(
    rexstr(DOCUMENT_START) + rexstr(/<p[^<>]*>/) + '?'
  )
  POSSIBLE_ENDS     = unirex(
    rexstr(SOME_NEW_LINES) + '|' +
    rexstr(DOCUMENT_END) + '|' +
    rexstr(/<\/p>/)
  )
  CHARACTER_ESCAPE  = unirex(
    rexstr(/\\/) +
    '(?:' +
      rexstr(ESCAPE_CHAR) + '|' +
      rexstr(/x/) + rexstr(HEXADECIMAL_CHARS) + '{2}' + '|' +
      rexstr(/u/) + rexstr(HEXADECIMAL_CHARS) + '{4}' + '|' +
      rexstr(/U/) + rexstr(HEXADECIMAL_CHARS) + '{8}' +
    ')'
  )
  ESCAPED_CHAR      = unirex(
    rexstr(/(?!["\\])/) + rexstr(NOT_LINE_BREAK) + '|' +
    rexstr(CHARACTER_ESCAPE)
  )
  ANY_ESCAPED_CHARS = unirex(
    rexstr(ESCAPED_CHAR) + '*'
  )
  ESCAPED_APOS      = unirex(
    '(?=' + rexstr(NOT_LINE_BREAK) + ')' + rexstr(/[^']|''/)
  )
  ANY_ESCAPED_APOS  = unirex(
    rexstr(ESCAPED_APOS) + '*'
  )
  FIRST_KEY_CHAR    = unirex(
    '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
    '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
    rexstr(NOT_INDICATOR) + '|' +
    rexstr(/[?:-]/) +
    '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
    '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
    '(?=' + rexstr(NOT_FLOW_CHAR) + ')'
  )
  #  Same as FIRST_KEY_CHAR, except that flow indicators are allowed in
  #  values.
  FIRST_VALUE_CHAR  = unirex(
    '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
    '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
    rexstr(NOT_INDICATOR) + '|' +
    rexstr(/[?:-]/) +
    '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
    '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
  )
  LATER_KEY_CHAR    = unirex(
    rexstr(WHITE_SPACE) + '|' +
    '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
    '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
    '(?=' + rexstr(NOT_FLOW_CHAR) + ')' +
    rexstr(/[^:#]#?/) + '|' +
    rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
  )
  #  Same as LATER_KEY_CHAR, except that flow indicators are allowed in
  #  values.
  LATER_VALUE_CHAR  = unirex(
    rexstr(WHITE_SPACE) + '|' +
    '(?=' + rexstr(NOT_LINE_BREAK) + ')' +
    '(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
    rexstr(/[^:#]#?/) + '|' +
    rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
  )

  #  YAML CONSTRUCTS  #

  YAML_START        = unirex(
    rexstr(ANY_WHITE_SPACE) + rexstr(/---/)
  )
  YAML_END          = unirex(
    rexstr(ANY_WHITE_SPACE) + rexstr(/(?:---|\.\.\.)/)
  )
  #  Cheap pre-check: a start marker, only allowed characters, and an end
  #  marker on its own line.
  YAML_LOOKAHEAD    = unirex(
    '(?=' +
      rexstr(YAML_START) +
      rexstr(ANY_ALLOWED_CHARS) + rexstr(NEW_LINE) +
      rexstr(YAML_END) + rexstr(POSSIBLE_ENDS) +
    ')'
  )
  YAML_DOUBLE_QUOTE = unirex(
    rexstr(/"/) + rexstr(ANY_ESCAPED_CHARS) + rexstr(/"/)
  )
  YAML_SINGLE_QUOTE = unirex(
    rexstr(/'/) + rexstr(ANY_ESCAPED_APOS) + rexstr(/'/)
  )
  YAML_SIMPLE_KEY   = unirex(
    rexstr(FIRST_KEY_CHAR) + rexstr(LATER_KEY_CHAR) + '*'
  )
  YAML_SIMPLE_VALUE = unirex(
    rexstr(FIRST_VALUE_CHAR) + rexstr(LATER_VALUE_CHAR) + '*'
  )
  YAML_KEY          = unirex(
    rexstr(YAML_DOUBLE_QUOTE) + '|' +
    rexstr(YAML_SINGLE_QUOTE) + '|' +
    rexstr(YAML_SIMPLE_KEY)
  )
  YAML_VALUE        = unirex(
    rexstr(YAML_DOUBLE_QUOTE) + '|' +
    rexstr(YAML_SINGLE_QUOTE) + '|' +
    rexstr(YAML_SIMPLE_VALUE)
  )
  YAML_SEPARATOR    = unirex(
    rexstr(ANY_WHITE_SPACE) +
    ':' + rexstr(WHITE_SPACE) +
    rexstr(ANY_WHITE_SPACE)
  )
  #  Capture 1 is the raw key, capture 2 the raw value (quotes included).
  YAML_LINE         = unirex(
    '(' + rexstr(YAML_KEY) + ')' +
    rexstr(YAML_SEPARATOR) +
    '(' + rexstr(YAML_VALUE) + ')'
  )

  #  FRONTMATTER REGEX  #

  #  At most five `key: value` lines (one plus `{0,4}`), all sharing the
  #  indentation captured from the first one (`\1`).
  YAML_FRONTMATTER  = unirex(
    rexstr(POSSIBLE_STARTS) +
    rexstr(YAML_LOOKAHEAD) +
    rexstr(YAML_START) + rexstr(SOME_NEW_LINES) +
    '(?:' +
      '(' + rexstr(INDENTATION) + ')' +
      rexstr(YAML_LINE) + rexstr(SOME_NEW_LINES) +
      '(?:' +
        '\\1' + rexstr(YAML_LINE) + rexstr(SOME_NEW_LINES) +
      '){0,4}' +
    ')?' +
    rexstr(YAML_END) + rexstr(POSSIBLE_ENDS)
  )

  #  SEARCHES  #

  FIND_YAML_LINES   = unirex(
    rexstr(NEW_LINE) + rexstr(INDENTATION) + rexstr(YAML_LINE)
  )

  #  STRING PROCESSING  #

  #  Replacement for every single-character escape listed in ESCAPE_CHAR.
  SIMPLE_ESCAPES = {
    '0'  => "\u{00}",
    'a'  => "\u{07}",
    'b'  => "\u{08}",
    't'  => "\u{09}",
    "\t" => "\u{09}",
    'n'  => "\u{0a}",
    'v'  => "\u{0b}",
    'f'  => "\u{0c}",
    'r'  => "\u{0d}",
    'e'  => "\u{1b}",
    ' '  => "\u{20}",
    '"'  => "\u{22}",
    '/'  => "\u{2f}",
    '\\' => "\u{5c}",
    'N'  => "\u{85}",
    '_'  => "\u{a0}",
    'L'  => "\u{2028}",
    'P'  => "\u{2029}"
  }.freeze

  #  One escape sequence inside a double-quoted string. Captures 1-3 hold
  #  the hexadecimal digits of \x, \u and \U; capture 4 holds a
  #  single-character escape.
  ESCAPE_SEQUENCE =
    /\\(?:x([0-9a-fA-F]{2})|u([0-9a-fA-F]{4})|U([0-9a-fA-F]{8})|([0abt\tnvfre "\/\\N_LP]))/

  private_constant :SIMPLE_ESCAPES, :ESCAPE_SEQUENCE

  #  Converts a raw key or value, as captured by YAML_LINE, into its text.
  #
  #  * "double quoted" - quotes are dropped and escape sequences resolved;
  #  * 'single quoted' - quotes are dropped and '' becomes ';
  #  * anything else   - returned unchanged.
  def process_string(str)
    case str[0]
    when '"' then unescape_double_quoted(str[1..-2])
    when "'" then str[1..-2].gsub("''", "'")
    else str
    end
  end

  #  BIO PROCESSING  #

  #  Splits `content` into its visible text and its frontmatter metadata.
  #
  #  Returns a Hash with two keys:
  #  * :text     - a copy of `content` with the frontmatter removed (an
  #                opening `<p …>` tag in front of it is kept);
  #  * :metadata - an Array of [key, value] String pairs, in document
  #                order; empty when no frontmatter was found.
  #
  #  `content` itself is never modified.
  def process_bio(content)
    #  NOTE(review): as written, both substitutions replace a quote with
    #  itself. Presumably they were meant to decode HTML entities such as
    #  &quot; and &apos; - confirm against the version-controlled file.
    #  They are kept verbatim; `gsub` also yields the private copy that is
    #  edited in place below.
    text = content.gsub(/"/, '"').gsub(/'/, "'")
    result = { text: text, metadata: [] }

    frontmatter = YAML_FRONTMATTER.match(text)
    return result unless frontmatter

    yaml = frontmatter[0]

    #  Cut exactly what was matched, minus the optional leading <p> tag.
    #  Offsets come from the match itself: searching the text for the
    #  first `---` again could hit an earlier, unrelated run of dashes.
    tag_length = POSSIBLE_STARTS.match(yaml).end(0)
    text[(frontmatter.begin(0) + tag_length)...frontmatter.end(0)] = ''

    result[:metadata] = yaml.scan(FIND_YAML_LINES).map do |key, value|
      [process_string(key), process_string(value)]
    end
    result
  end

  private

  #  Resolves all escape sequences of a double-quoted body in a single
  #  left-to-right pass, so the output of one escape is never re-read as
  #  the start of another (an escaped backslash followed by `n` stays a
  #  backslash and an `n`).
  def unescape_double_quoted(body)
    body.gsub(ESCAPE_SEQUENCE) do
      escape = Regexp.last_match
      digits = escape[1] || escape[2] || escape[3]
      digits ? codepoint_to_string(digits.hex) : SIMPLE_ESCAPES.fetch(escape[4])
    end
  end

  #  Returns the UTF-8 character for `codepoint`, or U+FFFD when the
  #  value is not a valid Unicode scalar (surrogates, above U+10FFFF).
  def codepoint_to_string(codepoint)
    codepoint.chr(Encoding::UTF_8)
  rescue RangeError
    "\u{fffd}"
  end
end
 |