neingeist
/
neinomaten
Archived
1
0
Fork 0
You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.

498 lines
15 KiB
Ruby

require 'htree/doc'
require 'htree/elem'
require 'htree/loc'
require 'htree/extract_text'
require 'uri'
module HTree
module Traverse
def doc?() Doc::Trav === self end
def elem?() Elem::Trav === self end
def text?() Text::Trav === self end
def xmldecl?() XMLDecl::Trav === self end
def doctype?() DocType::Trav === self end
def procins?() ProcIns::Trav === self end
def comment?() Comment::Trav === self end
def bogusetag?() BogusETag::Trav === self end
def get_subnode(*indexes)
n = self
indexes.each {|index|
n = n.get_subnode_internal(index)
}
n
end
end
module Container::Trav
# +each_child+ iterates over each child.
def each_child(&block) # :yields: child_node
children.each(&block)
nil
end
# +each_child_with_index+ iterates over each child.
def each_child_with_index(&block) # :yields: child_node, index
children.each_with_index(&block)
nil
end
# +find_element+ searches an element which universal name is specified by
# the arguments.
# It returns nil if not found.
def find_element(*names)
traverse_element(*names) {|e| return e }
nil
end
# +traverse_element+ traverses elements in the tree.
# It yields elements in depth first order.
#
# If _names_ are empty, it yields all elements.
# If non-empty _names_ are given, it should be list of universal names.
#
# A nested element is yielded in depth first order as follows.
#
# t = HTree('<a id=0><b><a id=1 /></b><c id=2 /></a>')
# t.traverse_element("a", "c") {|e| p e}
# # =>
# {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
# {emptyelem <a id="1">}
# {emptyelem <c id="2">}
#
# Universal names are specified as follows.
#
# t = HTree(<<'End')
# <html>
# <meta name="robots" content="index,nofollow">
# <meta name="author" content="Who am I?">
# </html>
# End
# t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
# # =>
# {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
# {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
#
def traverse_element(*names, &block) # :yields: element
if names.empty?
traverse_all_element(&block)
else
name_set = {}
names.each {|n| name_set[n] = true }
traverse_some_element(name_set, &block)
end
nil
end
def each_hyperlink_attribute
traverse_element(
'{http://www.w3.org/1999/xhtml}a',
'{http://www.w3.org/1999/xhtml}area',
'{http://www.w3.org/1999/xhtml}link',
'{http://www.w3.org/1999/xhtml}img',
'{http://www.w3.org/1999/xhtml}object',
'{http://www.w3.org/1999/xhtml}q',
'{http://www.w3.org/1999/xhtml}blockquote',
'{http://www.w3.org/1999/xhtml}ins',
'{http://www.w3.org/1999/xhtml}del',
'{http://www.w3.org/1999/xhtml}form',
'{http://www.w3.org/1999/xhtml}input',
'{http://www.w3.org/1999/xhtml}head',
'{http://www.w3.org/1999/xhtml}base',
'{http://www.w3.org/1999/xhtml}script') {|elem|
case elem.name
when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
attrs = ['href']
when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
attrs = ['src', 'longdesc', 'usemap']
when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
attrs = ['classid', 'codebase', 'data', 'usemap']
when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
attrs = ['cite']
when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
attrs = ['action']
when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
attrs = ['src', 'usemap']
when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
attrs = ['profile']
when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
attrs = ['src', 'for']
end
attrs.each {|attr|
if hyperlink = elem.get_attribute(attr)
yield elem, attr, hyperlink
end
}
}
end
private :each_hyperlink_attribute
# +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields HTree::Text (or HTree::Loc) and URI for each hyperlink.
#
# The URI objects are created with a base URI which is given by
# HTML BASE element or the argument ((|base_uri|)).
# +each_hyperlink_uri+ doesn't yields href of the BASE element.
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
base_uri = URI.parse(base_uri) if String === base_uri
links = []
each_hyperlink_attribute {|elem, attr, hyperlink|
if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
base_uri = URI.parse(hyperlink.to_s)
else
links << hyperlink
end
}
if base_uri
links.each {|hyperlink| yield hyperlink, base_uri + hyperlink.to_s }
else
links.each {|hyperlink| yield hyperlink, URI.parse(hyperlink.to_s) }
end
end
# +each_hyperlink+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields HTree::Text or HTree::Loc.
#
# Note that +each_hyperlink+ yields HTML href attribute of BASE element.
def each_hyperlink # :yields: text
links = []
each_hyperlink_attribute {|elem, attr, hyperlink|
yield hyperlink
}
end
# +each_uri+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields URI for each hyperlink.
#
# The URI objects are created with a base URI which is given by
# HTML BASE element or the argument ((|base_uri|)).
def each_uri(base_uri=nil) # :yields: URI
each_hyperlink_uri(base_uri) {|hyperlink, uri| yield uri }
end
end
# :stopdoc:
module Doc::Trav
def traverse_all_element(&block)
children.each {|c| c.traverse_all_element(&block) }
end
end
module Elem::Trav
def traverse_all_element(&block)
yield self
children.each {|c| c.traverse_all_element(&block) }
end
end
module Leaf::Trav
def traverse_all_element
end
end
module Doc::Trav
def traverse_some_element(name_set, &block)
children.each {|c| c.traverse_some_element(name_set, &block) }
end
end
module Elem::Trav
def traverse_some_element(name_set, &block)
yield self if name_set.include? self.name
children.each {|c| c.traverse_some_element(name_set, &block) }
end
end
module Leaf::Trav
def traverse_some_element(name_set)
end
end
# :startdoc:
module Traverse
# +traverse_text+ traverses texts in the tree
def traverse_text(&block) # :yields: text
traverse_text_internal(&block)
nil
end
end
# :stopdoc:
module Container::Trav
def traverse_text_internal(&block)
each_child {|c| c.traverse_text_internal(&block) }
end
end
module Leaf::Trav
def traverse_text_internal
end
end
module Text::Trav
def traverse_text_internal
yield self
end
end
# :startdoc:
module Container::Trav
# +filter+ rebuilds the tree without some components.
#
# node.filter {|descendant_node| predicate } -> node
# loc.filter {|descendant_loc| predicate } -> node
#
# +filter+ yields each node except top node.
# If given block returns false, corresponding node is dropped.
# If given block returns true, corresponding node is retained and
# inner nodes are examined.
#
# +filter+ returns an node.
# It doesn't return location object even if self is location object.
#
def filter(&block)
subst = {}
each_child_with_index {|descendant, i|
if yield descendant
if descendant.elem?
subst[i] = descendant.filter(&block)
else
subst[i] = descendant
end
else
subst[i] = nil
end
}
to_node.subst_subnode(subst)
end
end
module Doc::Trav
# +title+ searches title and return it as a text.
# It returns nil if not found.
#
# +title+ searchs following information.
#
# - <title>...</title> in HTML
# - <title>...</title> in RSS
# - <title>...</title> in Atom
def title
e = find_element('title',
'{http://www.w3.org/1999/xhtml}title',
'{http://purl.org/rss/1.0/}title',
'{http://my.netscape.com/rdf/simple/0.9/}title',
'{http://www.w3.org/2005/Atom}title',
'{http://purl.org/atom/ns#}title')
e && e.extract_text
end
# +author+ searches author and return it as a text.
# It returns nil if not found.
#
# +author+ searchs following information.
#
# - <meta name="author" content="author-name"> in HTML
# - <link rev="made" title="author-name"> in HTML
# - <dc:creator>author-name</dc:creator> in RSS
# - <dc:publisher>author-name</dc:publisher> in RSS
# - <author><name>author-name</name></author> in Atom
def author
traverse_element('meta',
'{http://www.w3.org/1999/xhtml}meta') {|e|
begin
next unless e.fetch_attr('name').downcase == 'author'
author = e.fetch_attribute('content').strip
return author if !author.empty?
rescue IndexError
end
}
traverse_element('link',
'{http://www.w3.org/1999/xhtml}link') {|e|
begin
next unless e.fetch_attr('rev').downcase == 'made'
author = e.fetch_attribute('title').strip
return author if !author.empty?
rescue IndexError
end
}
if channel = find_element('{http://purl.org/rss/1.0/}channel')
channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') {|e|
begin
author = e.extract_text.strip
return author if !author.empty?
rescue IndexError
end
}
channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') {|e|
begin
author = e.extract_text.strip
return author if !author.empty?
rescue IndexError
end
}
end
['http://www.w3.org/2005/Atom', 'http://purl.org/atom/ns#'].each {|xmlns|
each_child {|top|
next unless top.elem?
if top.name == "{#{xmlns}}feed"
if feed_author = find_element("{#{xmlns}}author")
feed_author.traverse_element("{#{xmlns}}name") {|e|
begin
author = e.extract_text.strip
return author if !author.empty?
rescue IndexError
end
}
end
end
}
}
nil
end
end
module Doc::Trav
# +root+ searches root element.
# If there is no element on top level, it raise HTree::Error.
# If there is two or more elements on top level, it raise HTree::Error.
def root
es = []
children.each {|c| es << c if c.elem? }
raise HTree::Error, "no element" if es.empty?
raise HTree::Error, "multiple top elements" if 1 < es.length
es[0]
end
# +has_xmldecl?+ returns true if there is an XML declaration on top level.
def has_xmldecl?
children.each {|c| return true if c.xmldecl? }
false
end
end
module Elem::Trav
# +name+ returns the universal name of the element as a string.
#
# p HTree('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>').root.name
# # =>
# "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF"
#
def name() element_name.universal_name end
# +qualified_name+ returns the qualified name of the element as a string.
#
# p HTree('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>').root.qualified_name
# # =>
# "rdf:RDF"
def qualified_name() element_name.qualified_name end
# +attributes+ returns attributes as a hash.
# The hash keys are HTree::Name objects.
# The hash values are HTree::Text or HTree::Location objects.
#
# p HTree('<a name="xx" href="uu">').root.attributes
# # =>
# {href=>{text "uu"}, name=>{text "xx"}}
#
# p HTree('<a name="xx" href="uu">').make_loc.root.attributes
# # =>
# {href=>#<HTree::Location: doc()/a/@href>, name=>#<HTree::Location: doc()/a/@name>}
#
def attributes
result = {}
each_attribute {|name, text|
result[name] = text
}
result
end
def each_attr
each_attribute {|name, text|
uname = name.universal_name
str = text.to_s
yield uname, str
}
end
# call-seq:
# elem.fetch_attribute(name) -> text or raise IndexError
# elem.fetch_attribute(name, default) -> text or default
# elem.fetch_attribute(name) {|uname| default } -> text or default
#
# +fetch_attribute+ returns an attribute value as a text.
#
# elem may be an instance of HTree::Elem or a location points to it.
def fetch_attribute(uname, *rest, &block)
if 1 < rest.length
raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)"
end
if !rest.empty? && block_given?
raise ArgumentError, "block supersedes default value argument"
end
uname = uname.universal_name if uname.respond_to? :universal_name
return update_attribute_hash.fetch(uname) {
if block_given?
return yield(uname)
elsif !rest.empty?
return rest[0]
else
raise IndexError, "attribute not found: #{uname.inspect}"
end
}
end
# call-seq:
# elem.fetch_attr(name) -> string or raise IndexError
# elem.fetch_attr(name, default) -> string or default
# elem.fetch_attr(name) {|uname| default } -> string or default
#
# +fetch_attr+ returns an attribute value as a string.
#
# elem may be an instance of HTree::Elem or a location points to it.
def fetch_attr(uname, *rest, &block)
if 1 < rest.length
raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)"
end
if !rest.empty? && block_given?
raise ArgumentError, "block supersedes default value argument"
end
uname = uname.universal_name if uname.respond_to? :universal_name
return update_attribute_hash.fetch(uname) {
if block_given?
return yield(uname)
elsif !rest.empty?
return rest[0]
else
raise IndexError, "attribute not found: #{uname.inspect}"
end
}.to_s
end
def get_attribute(uname)
uname = uname.universal_name if uname.respond_to? :universal_name
update_attribute_hash[uname]
end
def get_attr(uname)
if text = update_attribute_hash[uname]
text.to_s
else
nil
end
end
end
end