Here are a few functions I wrote to remove duplicate XML nodes with Nokogiri. From my tests, they appear to be working:
require "nokogiri"
require "ruby-debug"
# sample XML for the demo: <father> contains <mini id="3"/> twice
data = <<XML
<bla>
<father>
<mini id="3"/>
<mini id="5"/>
<mini id="3"/>
</father>
</bla>
XML
# Tells whether two attribute hashes carry the same keys with the same values.
# Each hash maps an attribute name to an object that responds to #value.
# Returns true when both hashes agree, false otherwise.
def same_attributes?(attr1, attr2)
  # every attribute of the first hash must exist in the second with an equal value
  forward_match = attr1.all? do |key, attribute|
    attr2.key?(key) && attribute.value == attr2[key].value
  end
  return false unless forward_match

  # and the second hash must not carry any key the first one lacks
  attr2.each_key.all? { |key| attr1.key?(key) }
end
# Recursively checks whether two nodes are the same: same name, same text,
# same attributes and pairwise-identical children.
#
# node1, node2 - the nodes to compare; a nil on either side means "not the same".
# truth_array  - accumulator that receives the attribute and per-child comparison
#                results; the final answer is true only if every entry is truthy.
#                Callers normally leave it at its default.
#
# Returns true or false.
def same_nodes?(node1, node2, truth_array = [])
  return false if node1.nil? || node2.nil?
  return false if node1.name != node2.name
  return false if node1.text != node2.text

  truth_array << same_attributes?(node1.attributes, node2.attributes)

  node1_kids = node1.children.to_a
  node2_kids = node2.children.to_a
  # zip only walks as far as the receiver, so extra children on node2 (e.g. an
  # empty element that adds no text) would otherwise never be compared
  return false if node1_kids.length != node2_kids.length

  node1_kids.zip(node2_kids).each do |kid1, kid2|
    truth_array << same_nodes?(kid1, kid2)
  end

  # if every value in the array is true, then the nodes are equal
  truth_array.all?
end
# Walks a node tree and strips duplicate children at every level.
# For each distinct child name (children named "text" are skipped) the
# duplicates are removed via remove_duplicates, then each remaining child
# is processed the same way.
def remove_copies(node)
  distinct_names = node.children.map(&:name).reject { |name| name == "text" }.uniq
  distinct_names.each { |child_name| remove_duplicates(node, child_name) }
  # re-read the children: the list may have changed after de-duplication
  node.children.each { |child| remove_copies(child) }
end
# Removes duplicate children named child_name from node.
#
# The first occurrence of each distinct child (as judged by same_nodes?) is
# left exactly where it is; only the later copies are removed, so the order
# of the remaining children is preserved.
#
# node       - the parent node whose children are inspected.
# child_name - the name of the children to de-duplicate.
#
# Returns the array of children that carried child_name before the cleanup.
def remove_duplicates(node, child_name)
  candidates = node.children.select { |kid| kid.name == child_name }
  kept = []
  candidates.each do |candidate|
    if kept.any? { |survivor| same_nodes?(survivor, candidate) }
      # a matching node was already kept: drop this copy only
      candidate.remove
    else
      kept << candidate
    end
  end
end
# parse the sample XML, strip duplicate nodes, then print what is left
document = Nokogiri::XML(data)
remove_copies(document)
document.children.each { |child| puts child }
I’m curious if there are other solutions to this problem.
0 Responses to “remove duplicate xml nodes using nokogiri”