REXMLでXMLをパース

REXMLでできるパースの種類は4種類あるようです

  • tree parsing API・・・DOMのようなもの
  • stream parsing API・・・SAXのようなもの。SAXより速いらしい
  • SAX2-based API・・・SAX2をベースにしたAPI。リスナーを登録してイベントを受け取る
  • Pull parsing API・・・Java SE 6で入ったStAXと似ている。pull型。書き出しはできない模様

http://www.germane-software.com/software/rexml/
http://www.germane-software.com/software/rexml/docs/tutorial.html


tree parsing API

XPathが使える

require "rexml/document"

# Sample document. The whitespace between the tags is kept by REXML as text
# nodes, so <foo> has three children: text, <bar>, text.
XML = <<EOS
<?xml version="1.0" encoding="UTF-8" ?>
  <foo>
    <bar>baz</bar>
</foo>
EOS

# Parse the whole document into a tree and walk it by child position.
dom = REXML::Document.new(XML)
root_element = dom.root
puts root_element.name
puts root_element.children[1].name
puts root_element.children[1].children[0].value
puts "0[#{root_element.children[0].value}]"
#puts "1[#{root_element.children[1].value}]" => undefined method `value' for <bar> ... </>:REXML::Element (NoMethodError)
puts "2[#{root_element.children[2].value}]"

# Fetch elements with XPath
puts dom.elements['/foo'].name
puts dom.elements['/foo/bar'].name
puts dom.elements['/foo/bar'].text
# each_with_index yields (child, index). The earlier each_index form yields
# only the index, leaving the second block parameter nil, which is why the
# captured output in this article shows "0[]", "1[]", "2[]". With the child
# available, the text nodes at 0 and 2 print their whitespace, and the <bar>
# element at 1 (no #value method) still prints an empty pair of brackets.
dom.elements['/foo'].children.each_with_index do |e, i|
  puts "#{i}[#{e.value if e.respond_to? :value}]"
end

# Build an XML document from scratch
doc = REXML::Document.new
doc << REXML::XMLDecl.new("1.0", "UTF-8")
foo = doc.add_element "foo"
foo.add_element("bar").add_text "baz"
puts doc

実行結果

# ruby dom.rb 
foo
bar
baz
0[
    ]
2[
]
foo
bar
baz
0[]
1[]
2[]
<?xml version='1.0' encoding='UTF-8'?><foo><bar>baz</bar></foo>

stream parsing API

require "rexml/parsers/streamparser"
require "rexml/parsers/baseparser"
require "rexml/streamlistener"

# Sample document fed to the stream parser. Nothing is interpolated, so a
# single-quoted heredoc yields exactly the text written here.
XML = <<'XML_DOC'
<?xml version="1.0" encoding="UTF-8" ?>
  <foo>
    <bar>baz</bar>
</foo>
XML_DOC

# Listener for the stream parser that prints one line per callback.
# It includes REXML::StreamListener and defines tag_start, tag_end and text.
class MyListener
  include REXML::StreamListener

  # Prints "tag_start:<name>"; attrs is accepted but not used.
  def tag_start(name, attrs)
    report("tag_start", name)
  end

  # Prints "tag_end:<name>".
  def tag_end(name)
    report("tag_end", name)
  end

  # Prints the text wrapped in brackets so whitespace-only text stays visible.
  def text(text)
    report("text", "[#{text}]")
  end

  private

  # Writes a single "label:detail" line to standard output.
  def report(label, detail)
    puts "#{label}:#{detail}"
  end
end

# Run the stream parser over XML; MyListener prints a line for each callback.
# The return value of #parse is printed too (the sample output shows "nil").
listener = MyListener.new
stream_parser = REXML::Parsers::StreamParser.new(XML, listener)
puts stream_parser.parse

実行結果

# ruby stream_parser.rb 
text:[
  ]
tag_start:foo
text:[
    ]
tag_start:bar
text:[baz]
tag_end:bar
text:[
]
tag_end:foo
text:[
]
nil

SAX2-based API

require "rexml/parsers/sax2parser"
require "rexml/parsers/baseparser"
require "rexml/sax2listener"

# Sample document fed to the SAX2 parser. Nothing is interpolated, so a
# single-quoted heredoc yields exactly the text written here.
XML = <<'XML_DOC'
<?xml version="1.0" encoding="UTF-8" ?>
  <foo>
    <bar>baz</bar>
</foo>
XML_DOC

# Listener that dumps every event it is handed, for inspection.
class MyEventListener
  # Prints the inspected form of +event+ and returns +event+ itself.
  def receive(event)
    event.tap { |payload| p payload }
  end
end

# SAX2 listener that announces the start and end of the document.
# It includes REXML::SAX2Listener and defines only these two callbacks.
class MyListener
  include REXML::SAX2Listener

  # Prints "started...".
  def start_document
    $stdout.puts('started...')
  end

  # Prints "ended...".
  def end_document
    $stdout.puts('ended...')
  end
end

# Wire both listeners to a SAX2 parser and run it over XML.
# The local is named sax_parser rather than p so it does not shadow Kernel#p.
sax_parser = REXML::Parsers::SAX2Parser.new(XML)
# MyEventListener#receive prints each event it is given; the sample output
# shows these as raw event arrays.
sax_parser.add_listener(MyEventListener.new)
# MyListener reports the start and end of the document.
sax_parser.listen(MyListener.new)
sax_parser.parse

実行結果

# ruby sax.rb 
started...
[:xmldecl, "1.0", "UTF-8", nil]
[:text, "\n  "]
[:start_element, "foo", {}]
[:text, "\n    "]
[:start_element, "bar", {}]
[:text, "baz"]
[:end_element, "bar"]
[:text, "\n"]
[:end_element, "foo"]
[:text, "\n"]
[:end_document]
ended...

Pull parsing API

require "rexml/parsers/pullparser"

# Sample document fed to the pull parser. Nothing is interpolated, so a
# single-quoted heredoc yields exactly the text written here.
XML = <<'XML_DOC'
<?xml version="1.0" encoding="UTF-8" ?>
  <foo>
    <bar>baz</bar>
</foo>
XML_DOC

parser = REXML::Parsers::PullParser.new(XML)

# Pull events one at a time and print only element and text events;
# every other event type is skipped.
while parser.has_next?
  event = parser.pull
  kind = event.event_type

  case kind
  when :start_element, :end_element
    # The label printed is the event type itself.
    puts "#{kind}:#{event[0]}"
  when :text
    # Brackets keep whitespace-only text visible in the output.
    puts "text:[#{event[0]}]"
  end
end

実行結果

# ruby pull_parser.rb 
text:[
  ]
start_element:foo
text:[
    ]
start_element:bar
text:[baz]
end_element:bar
text:[
]
end_element:foo
text:[
]