使用parsel处理xml

2020/11/05 技术

使用parsel处理xml

1. xml解析

使用 parsel 包解析 XML 数据:

from parsel import Selector
# Build an XML-mode selector from the document text, then print every
# <node> element whose id attribute is "root".
# NOTE(review): `self.xml` is not defined in this snippet; it presumably
# refers to the XML string set up by the test case in section 2.
resp = Selector(text=self.xml, type="xml")
root_matches = resp.xpath('//node[@id="root"]')
for match in root_matches:
    print(match)

2. 删除 xml 节点

import unittest

from parsel import Selector


class TestXmlEdit(unittest.TestCase):
    """Demonstrate walking and pruning an XML tree through parsel selectors."""

    def setUp(self) -> None:
        """Store a small XML hierarchy with two sibling nodes that share id "AAA"."""
        self.xml = '''<?xml version='1.0' encoding='UTF-8' standalone='yes' ?>
        <hierarchy>
            <node id="root">
                <node index="0" id="AAA" bounds="[1, 1]">
                    <node index="0" id="a" />
                    <node index="1" id="b" />
                </node>
                <node index="1" id="AAA" bounds="[1, 1]">
                    <node index="0" id="A" />
                    <node index="1" id="B" />
                </node>
            </node>
        </hierarchy>
        '''

    def testParselRemove(self):
        """Remove the node with id "AAA" and index "0"; check only its sibling is left."""

        def iter_node(_node: Selector):
            """Print ``_node`` and then each of its descendants, depth first."""
            print("node: tag {}, id {}, index {}, child_count {}".format(
                _node.root.tag, _node.attrib.get("id"), _node.attrib.get("index"), len(_node.root)
            ))
            for _sub_index, _sub_node in enumerate(_node.root):
                # Sanity check: indexing the parent yields the very object being iterated.
                assert _node.root[_sub_index] is _sub_node
                # Wrap the child element in a Selector that inherits the parent's settings.
                _sub_selector = Selector(root=_sub_node, namespaces=_node.namespaces, type=_node.type)
                iter_node(_node=_sub_selector)

        def iter_then_remove_node(_node: Selector):
            """Detach every child with id "AAA" and index "0"; recurse into the others."""
            _remove_list = []
            for _sub_node in _node.root:
                _sub_node_id = _sub_node.attrib.get("id")
                _sub_node_index = _sub_node.attrib.get("index")
                if _sub_node_id == "AAA" and _sub_node_index == "0":
                    _remove_list.append(_sub_node)
                else:
                    _sub_selector = Selector(root=_sub_node, namespaces=_node.namespaces, type=_node.type)
                    iter_then_remove_node(_node=_sub_selector)

            # Removal happens after the loop so the child list is not mutated
            # while it is being iterated; each element is removed from its
            # direct parent.
            for _sub_node in _remove_list:
                print("remove node: tag {}, id {}, index {}".format(
                    _sub_node.tag, _sub_node.attrib.get("id"), _sub_node.attrib.get("index")
                ))
                _node.root.remove(_sub_node)

        root = Selector(text=self.xml, type="xml")
        iter_node(_node=root)

        print("\n\n\nbefore:")
        before = root.xpath('//node[@id="AAA"]')
        for tree in before:
            print(tree.attrib["id"], tree.attrib["bounds"])
            iter_node(tree)
        self.assertEqual(["0", "1"], [tree.attrib["index"] for tree in before])

        # Remove the matching node(s).
        print("\n\n\nremove node:")
        iter_then_remove_node(_node=root)

        # Author's note: trying to remove a matched node directly via the
        # top-level root (`root.remove(node_list[0])`) failed with
        # "ValueError: Element is not a child of this node."
        print("\n\n\nafter remove:")
        after = root.xpath('//node[@id="AAA"]')
        for tree in after:
            print(tree.attrib["id"], tree.attrib["bounds"])
            iter_node(tree)

        # Only the index="1" sibling survives, and the removed node's
        # children ("a", "b") are gone from the document with it.
        self.assertEqual(["1"], [tree.attrib["index"] for tree in after])
        self.assertEqual(0, len(root.xpath('//node[@id="a" or @id="b"]')))
        self.assertEqual(2, len(root.xpath('//node[@id="A" or @id="B"]')))

3. 节点搜索举例

多个关系

# Within a single node: combine attribute predicates with `or`, `and`, `not()`.
resp.xpath('//node[@text="a" or @text="b"]')
resp.xpath('//node[@text="a" and @class="X"]')
resp.xpath('//node[( @text="a" or @text="b" ) and @class="X"]')
# As above, additionally requiring that no descendant node has class "X".
resp.xpath('//node[( @text="a" or @text="b" ) and @class="X" and not(.//node[@class="X"]) ]')

# Across nodes: `|` unions the results of two separate paths.
resp.xpath('//node[@text="a"] | //node[@text="b"]')

正则匹配

# Bind the `re` prefix to the EXSLT regular-expressions namespace so that
# `re:test(value, pattern)` can be used inside XPath predicates.
namespaces={'re': "http://exslt.org/regular-expressions"}

# <node> elements whose class attribute matches the pattern ".*FrameLayout".
resp.xpath('//node[re:test(@class, ".*FrameLayout")]', namespaces=namespaces)
# <a> elements whose id is "hypProduct_" followed only by digits.
resp.xpath('//a[re:test(@id, "^hypProduct_[0-9]+$")]', namespaces=namespaces)
# <a> elements whose class does not end with "EditText" (negative lookbehind at end of string).
resp.xpath('//a[re:test(@class, "(?<!EditText)$")]', namespaces=namespaces)
# <node> elements whose class matches either pattern.
resp.xpath('//node[re:test(@class, ".*A") or re:test(@class, ".*B") ]', namespaces=namespaces)

相对位置

# Last match: the whole path is parenthesized so that [last()] applies to the
# combined result set rather than to each parent's children separately.
resp.xpath('(//node[@text="X"])[last()]')

# A -> parent -> parent -> B: start at the node with text "A", climb two
# levels through parent <node> elements, then select a child with text "B".
a_to_b_path = (
    '//node[@text="A"]'
    '/parent::node/parent::node/'
    'node[@text="B"]'
)
resp.xpath(a_to_b_path)

Search

    Table of Contents