Class: Meteor::Ml::Html::ParserImpl

Inherits:
Core::Kernel show all
Defined in:
lib/meteor.rb,
lib/meteor.rb

Overview

HTML parser (HTMLパーサ)

Direct Known Subclasses

Meteor::Ml::Html5::ParserImpl

Constant Summary

KAIGYO_CODE =

KAIGYO_CODE = "\r?\n|\r" KAIGYO_CODE = "\r\n|\n|\r"

["\r\n", "\n", "\r"]
NBSP_2 =
' '
NBSP_3 =
'nbsp'
BR_1 =
"\r?\n|\r"
BR_2 =
'<br>'
META =
'META'
META_S =
'meta'
HTTP_EQUIV =
'http-equiv'
CONTENT_TYPE =
'Content-Type'
CONTENT =
'content'
OPTION =
'option'
SELECTED =
'selected'
INPUT =
'input'
CHECKED =
'checked'
RADIO =
'radio'
DISABLE_ELEMENT =

DISABLE_ELEMENT = "input|textarea|select|optgroup"

['input', 'textarea', 'select', 'optgroup']
DISABLED =
'disabled'
READONLY_TYPE =

READONLY_TYPE = "text|password"

['text', 'password']
TEXTAREA =
'textarea'
READONLY =
'readonly'
SELECT =
'select'
MULTIPLE =
'multiple'
SELECTED_M =

@@pattern_option = Regexp.new(OPTION) @@pattern_selected = Regexp.new(SELECTED) @@pattern_input = Regexp.new(INPUT) @@pattern_checked = Regexp.new(CHECKED) @@pattern_radio = Regexp.new(RADIO) @@pattern_disable_element = Regexp.new(DISABLE_ELEMENT) @@pattern_disabled = Regexp.new(DISABLED) @@pattern_readonly_type = Regexp.new(READONLY_TYPE) @@pattern_textarea = Regexp.new(TEXTAREA) @@pattern_readonly = Regexp.new(READONLY) @@pattern_select = Regexp.new(SELECT) @@pattern_multiple = Regexp.new(MULTIPLE)

'\\sselected\\s|\\sselected$|\\sSELECTED\\s|\\sSELECTED$'
SELECTED_R =

SELECTED_M = [' selected ',' selected',' SELECTED ',' SELECTED']

'selected\\s|selected$|SELECTED\\s|SELECTED$'
CHECKED_M =
'\\schecked\\s|\\schecked$|\\sCHECKED\\s|\\sCHECKED$'
CHECKED_R =

CHECKED_M = [' checked ',' checked',' CHECKED ',' CHECKED']

'checked\\s|checked$|CHECKED\\s|CHECKED$'
DISABLED_M =
'\\sdisabled\\s|\\sdisabled$|\\sDISABLED\\s|\\sDISABLED$'
DISABLED_R =

DISABLED_M = [' disabled ',' disabled',' DISABLED ',' DISABLED']

'disabled\\s|disabled$|DISABLED\\s|DISABLED$'
READONLY_M =
'\\sreadonly\\s|\\sreadonly$|\\sREADONLY\\s|\\sREADONLY$'
READONLY_R =

READONLY_M = [' readonly ',' readonly',' READONLY ',' READONLY']

'readonly\\s|readonly$|READONLY\\s|READONLY$'
MULTIPLE_M =
'\\smultiple\\s|\\smultiple$|\\sMULTIPLE\\s|\\sMULTIPLE$'
MULTIPLE_R =

MULTIPLE_M = [' multiple ',' multiple',' MULTIPLE ',' MULTIPLE']

'multiple\\s|multiple$|MULTIPLE\\s|MULTIPLE$'
TRUE =
'true'
FALSE =
'false'
TYPE_L =

@@pattern_true = Regexp.new(TRUE) @@pattern_false = Regexp.new(FALSE)

'type'
TYPE_U =
'TYPE'
PATTERN_UNESCAPE =
'&(amp|quot|apos|gt|lt|nbsp);'
GET_ATTRS_MAP2 =
'\\s(disabled|readonly|checked|selected|multiple)'
TABLE_FOR_ESCAPE_ =
{
    '&' => '&amp;',
    '"' => '&quot;',
    '\'' => '&apos;',
    '<' => '&lt;',
    '>' => '&gt;',
    ' ' => '&nbsp;',
}
TABLE_FOR_ESCAPE_CONTENT_ =
{
    '&' => '&amp;',
    '"' => '&quot;',
    '\'' => '&apos;',
    '<' => '&lt;',
    '>' => '&gt;',
    ' ' => '&nbsp;',
    "\r\n" => '<br>',
    "\r" => '<br>',
    "\n" => '<br>',
}
PATTERN_ESCAPE =
"[&\"'<> ]"
PATTERN_ESCAPE_CONTENT =
"[&\"'<> \\n]"
@@match_tag =

MATCH_TAG = "br|hr|img|input|meta|base"

['br', 'hr', 'img', 'input', 'meta', 'base']
@@match_tag_2 =

@@match_tag_2 = "textarea|option|pre"

['textarea', 'option', 'pre']
@@match_tag_sng =
Array

入れ子にできない要素

['texarea', 'select', 'option', 'form', 'fieldset']
@@attr_logic =
Array

論理値で指定する属性

['disabled', 'readonly', 'checked', 'selected', 'multiple']
@@pattern_selected_m =
Regexp.new(SELECTED_M)
@@pattern_selected_r =
Regexp.new(SELECTED_R)
@@pattern_checked_m =
Regexp.new(CHECKED_M)
@@pattern_checked_r =
Regexp.new(CHECKED_R)
@@pattern_disabled_m =
Regexp.new(DISABLED_M)
@@pattern_disabled_r =
Regexp.new(DISABLED_R)
@@pattern_readonly_m =
Regexp.new(READONLY_M)
@@pattern_readonly_r =
Regexp.new(READONLY_R)
@@pattern_multiple_m =
Regexp.new(MULTIPLE_M)
@@pattern_multiple_r =
Regexp.new(MULTIPLE_R)
@@pattern_unescape =
Regexp.new(PATTERN_UNESCAPE)
@@pattern_get_attrs_map2 =
Regexp.new(GET_ATTRS_MAP2)
@@pattern_escape =
Regexp.new(PATTERN_ESCAPE)
@@pattern_escape_content =
Regexp.new(PATTERN_ESCAPE_CONTENT)
@@pattern_br_2 =
Regexp.new(BR_2)
@@pattern_and_1 =
Regexp.new(AND_1)
@@pattern_lt_1 =
Regexp.new(LT_1)
@@pattern_gt_1 =
Regexp.new(GT_1)
@@pattern_dq_1 =
Regexp.new(DOUBLE_QUATATION)
@@pattern_space_1 =
Regexp.new(SPACE)
@@pattern_br_1 =
Regexp.new(BR_1)
@@pattern_lt_2 =
Regexp.new(LT_2)
@@pattern_gt_2 =
Regexp.new(GT_2)
@@pattern_dq_2 =
Regexp.new(QO_2)
@@pattern_space_2 =
Regexp.new(NBSP_2)
@@pattern_and_2 =
Regexp.new(AND_2)

Constants inherited from Core::Kernel

AND_1, AND_2, AND_3, AP_1, AP_2, AP_3, ATTR_EQ, CLEAN_1, CLEAN_2, DOUBLE_QUATATION, EMPTY, EN_1, ERASE_ATTR_1, ESCAPE_ENTITY_REF, GET_ATTRS_MAP, GET_ATTR_1, GT_1, GT_2, GT_3, LT_1, LT_2, LT_3, MODE, MODE_AF, MODE_BF, MODE_UTF8, PATTERN_FIND_1, PATTERN_FIND_2_1, PATTERN_FIND_2_2, PATTERN_FIND_2_3, PATTERN_FIND_3, PATTERN_FIND_4, PATTERN_FIND_5, QO_2, QO_3, SEARCH_CX_1, SEARCH_CX_2, SEARCH_CX_3, SEARCH_CX_4, SEARCH_CX_5, SEARCH_CX_6, SET_ATTR_1, SET_CX_1, SET_CX_2, SET_CX_3, SET_CX_4, SET_MONO_1, SPACE, TAG_CLOSE, TAG_CLOSE3, TAG_OPEN, TAG_OPEN3, TAG_SEARCH_1_1, TAG_SEARCH_1_2, TAG_SEARCH_1_3, TAG_SEARCH_1_4, TAG_SEARCH_1_4_2, TAG_SEARCH_2_1, TAG_SEARCH_2_1_2, TAG_SEARCH_2_2, TAG_SEARCH_2_2_2, TAG_SEARCH_2_3, TAG_SEARCH_2_3_2, TAG_SEARCH_2_3_2_2, TAG_SEARCH_2_4, TAG_SEARCH_2_4_2, TAG_SEARCH_2_4_2_2, TAG_SEARCH_2_4_2_3, TAG_SEARCH_2_4_3, TAG_SEARCH_2_4_3_2, TAG_SEARCH_2_4_4, TAG_SEARCH_2_6, TAG_SEARCH_2_7, TAG_SEARCH_3_1, TAG_SEARCH_3_1_2, TAG_SEARCH_3_1_2_2, TAG_SEARCH_3_2, TAG_SEARCH_3_2_2, TAG_SEARCH_3_2_2_2, TAG_SEARCH_4_1, TAG_SEARCH_4_2, TAG_SEARCH_4_3, TAG_SEARCH_4_4, TAG_SEARCH_4_5, TAG_SEARCH_4_6, TAG_SEARCH_4_7, TAG_SEARCH_4_7_2, TAG_SEARCH_NC_1_1, TAG_SEARCH_NC_1_2, TAG_SEARCH_NC_1_3, TAG_SEARCH_NC_1_4, TAG_SEARCH_NC_1_4_2, TAG_SEARCH_NC_2_1, TAG_SEARCH_NC_2_1_2, TAG_SEARCH_NC_2_2, TAG_SEARCH_NC_2_2_2, TAG_SEARCH_NC_2_3, TAG_SEARCH_NC_2_3_2, TAG_SEARCH_NC_2_3_2_2, TAG_SEARCH_NC_2_4, TAG_SEARCH_NC_2_4_2, TAG_SEARCH_NC_2_4_2_2, TAG_SEARCH_NC_2_4_2_3, TAG_SEARCH_NC_2_4_3, TAG_SEARCH_NC_2_4_3_2, TAG_SEARCH_NC_2_4_4, TAG_SEARCH_NC_2_6, TAG_SEARCH_NC_2_7, TAG_SEARCH_NC_3_1, TAG_SEARCH_NC_3_1_2, TAG_SEARCH_NC_3_1_2_2, TAG_SEARCH_NC_3_2, TAG_SEARCH_NC_3_2_2, TAG_SEARCH_NC_3_2_2_2

Constants inherited from Parser

HTML, HTML5, XHTML, XHTML5, XML

Instance Attribute Summary

Attributes inherited from Core::Kernel

doc_type, document_hook, element_cache, element_hook

Instance Method Summary (collapse)

Methods inherited from Core::Kernel

#attr, #attr_map, #character_encoding, #character_encoding=, #content, #create_element_pattern, #cxtag, #document, #document=, #element, #execute, #find, #flush, #remove_element, #root_element, #shadow

Constructor Details

- (ParserImpl) initialize - (ParserImpl) initialize(ps)

initializer (イニシャライザ)

Overloads:



3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
# File 'lib/meteor.rb', line 3309

# Initializer for the HTML parser.
#
# Marks the parser as an HTML parser (Parser::HTML) and then dispatches on
# the number of arguments to initialize_0 or initialize_1.
#
# @overload initialize
# @overload initialize(ps)
#   @param ps [Object] forwarded unchanged to initialize_1
#     (NOTE(review): presumably a parser to copy from -- confirm against initialize_1)
# @raise [ArgumentError] when the argument count is neither ZERO nor ONE
def initialize(*args)
  super()
  @doc_type = Parser::HTML
  case args.length
  when ZERO
    initialize_0
  when ONE
    initialize_1(args[0])
  else
    # Say what was received and what is accepted instead of raising a bare error.
    raise ArgumentError,
          "wrong number of arguments (given #{args.length}, expected #{ZERO}..#{ONE})"
  end
end

Instance Method Details

- (String) content_type

get content type (コンテントタイプを取得する)

Returns:

  • (String)

    content type (コンテントタイプ)



3380
3381
3382
# File 'lib/meteor.rb', line 3380

# Get the content type by delegating to the root element.
#
# @return [String] whatever @root.content_type returns
def content_type
  root_element = @root
  root_element.content_type
end

- (Object) escape(content)



3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
# File 'lib/meteor.rb', line 3815

# Replace special characters in +content+ with their entity references.
#
# The string is modified in place (gsub!) and the same object is returned.
# Substitutions run in the listed order; "&" is handled first, before the
# other replacements are applied.
#
# @param content [String] text to escape (mutated)
# @return [String] the same +content+ object
def escape(content)
  substitutions = [
    [AND_1,            @@pattern_and_1,   AND_2],  # "&" -> "&amp;"
    [LT_1,             @@pattern_lt_1,    LT_2],   # "<" -> "&lt;"
    [GT_1,             @@pattern_gt_1,    GT_2],   # ">" -> "&gt;"
    [DOUBLE_QUATATION, @@pattern_dq_1,    QO_2],   # '"' -> "&quot;"
    [SPACE,            @@pattern_space_1, NBSP_2]  # " " -> "&nbsp;"
  ]

  substitutions.each do |literal, pattern, replacement|
    # Cheap substring check first; only run the regexp when there is a hit.
    content.gsub!(pattern, replacement) if content.include?(literal)
  end

  content
end

- (Object) escape_content(content, elm)



3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
# File 'lib/meteor.rb', line 3822

# Escape +content+ for use as the content of element +elm+.
#
# Runs #escape first, then replaces line breaks (@@pattern_br_1) with BR_2,
# except when elm.cx is falsy and is_match(@@match_tag_2, elm.name) is
# truthy. @@match_tag_2 lists textarea, option and pre.
#
# @param content [String] text to escape
# @param elm [Object] element providing #cx and #name
# @return [String] the escaped content
def escape_content(content, elm)
  escaped = escape(content)

  # Same short-circuit order as checking elm.cx first: is_match is only
  # consulted when elm.cx is falsy.
  skip_br = !elm.cx && is_match(@@match_tag_2, elm.name)

  # "\r?\n" -> "<br>"
  escaped.gsub!(@@pattern_br_1, BR_2) unless skip_br

  escaped
end

- (Object) parse(document)

set document in parser (ドキュメントをパーサにセットする)

Parameters:

  • document (String)

    document (ドキュメント)



3347
3348
3349
3350
# File 'lib/meteor.rb', line 3347

# Set a document in the parser and analyze it.
#
# Stores +document+ on the root element, then runs analyze_ml.
#
# @param document [String] document to set
# @return [Object] the result of analyze_ml
def parse(document)
  root_element = @root
  root_element.document = document
  analyze_ml
end

- (Object) read(file_path, encoding)

read file , set in parser (ファイルを読み込み、パーサにセットする)

Parameters:

  • file_path (String)

    file path (ファイルパス)

  • encoding (String)

    character encoding (エンコーディング)



3357
3358
3359
3360
# File 'lib/meteor.rb', line 3357

# Read a file, set it in the parser, and analyze it.
#
# Delegates the actual reading to the parent class implementation with the
# same arguments, then runs analyze_ml.
#
# @param file_path [String] file path
# @param encoding [String] character encoding
# @return [Object] the result of analyze_ml
def read(file_path, encoding)
  # Bare super forwards file_path and encoding unchanged.
  super
  analyze_ml
end