12
Oct
09

remove duplicate xml nodes using nokogiri



Here are a few functions I wrote to remove duplicate nodes. From my tests, they appear to be working:


require "nokogiri"
require "ruby-debug"

# Sample document used to exercise the functions below: a <father> element
# with three <mini> children, two of which carry the same id="3" attribute.
data = <<EOF
<bla>
	<father>
		<mini id="3"/>
		<mini id="5"/>
		<mini id="3"/>
	</father>
</bla>
EOF

# Tell whether two attribute maps are equal: both must hold exactly the same
# keys, and for every key the entries' +value+ must match.
#
# attr1, attr2 - hash-like maps from attribute name to an object responding
#                to +value+.
#
# Returns true when the maps match, false otherwise.
def same_attributes?(attr1, attr2)
	# A key of attr1 that attr2 lacks, or holds with another value, is a mismatch.
	differs = attr1.any? { |key, attribute| !attr2.has_key?(key) || attribute.value != attr2[key].value }
	return false if differs

	# Look from the other side too, so a key present only in attr2 is caught.
	attr2.all? { |key, _attribute| attr1.has_key?(key) }
end

# Recursively check whether two nodes are the same: equal name, equal text,
# equal attributes (via same_attributes?) and pairwise-equal children.
#
# node1, node2 - the nodes to compare; either may be nil, in which case the
#                result is false.
# truth_array  - accumulator the attribute and child comparison results are
#                appended to; a falsy entry already present makes the result
#                false.
#
# Returns true when every collected comparison is true, false otherwise.
def same_nodes?(node1, node2, truth_array = [])
	return false if node1.nil? || node2.nil?
	return false if node1.name != node2.name
	return false if node1.text != node2.text

	truth_array << same_attributes?(node1.attributes, node2.attributes)

	node1_kids = node1.children
	node2_kids = node2.children
	# zip only walks as far as node1's last child, so without this check any
	# extra children on node2 would never be compared and the nodes would
	# wrongly be reported as equal.
	return false if node1_kids.length != node2_kids.length

	node1_kids.zip(node2_kids).each do |pair|
		truth_array << same_nodes?(pair[0], pair[1])
	end
	# if every value in the array is true, then the nodes are equal
	truth_array.all?
end

# Remove duplicate nodes from a document, recursively: for every distinct
# child name under +node+ (children named "text" are skipped) call
# remove_duplicates, then repeat the process for each child of +node+.
def remove_copies(node)
	child_names = node.children.reject { |kid| kid.name == "text" }.map { |kid| kid.name }.uniq
	child_names.each do |child_name|
		remove_duplicates(node, child_name)
	end
	# Children are fetched again here, i.e. after the de-duplication above ran.
	node.children.each { |kid| remove_copies(kid) }
end

# Remove duplicate children named +child_name+ from +node+.
#
# The first occurrence of each distinct child (as judged by same_nodes?) is
# left exactly where it is; only the later copies are detached. Children with
# other names are not touched, so the relative order of the surviving
# children is preserved.
#
# node       - the parent whose children are de-duplicated in place.
# child_name - only children whose +name+ equals this are considered.
#
# Returns the list of children that carried +child_name+ before the call.
def remove_duplicates(node, child_name)
	candidates = node.children.select { |kid| kid.name == child_name }
	kept = []
	candidates.each do |candidate|
		# any? stops at the first match, so a duplicate is not compared
		# against the remaining kept nodes.
		if kept.any? { |kept_node| same_nodes?(kept_node, candidate) }
			candidate.remove
		else
			kept << candidate
		end
	end
end

# Parse the sample data, strip the duplicate nodes in place, then print each
# top-level node of the resulting document.
doc = Nokogiri::XML(data)
remove_copies(doc)
doc.children.each do |top_level_node|
	puts top_level_node
end

I’m curious if there are other solutions to this problem.


0 Responses to “remove duplicate xml nodes using nokogiri”



  1. Leave a Comment

Leave a comment


Blog Stats

  • 281,666 hits