本文实例讲述了python自定义解析简单xml格式文件的方法。分享给大家供大家参考。具体分析如下:
因为公司内部的接口返回的字串支持2种形式:php数组,xml;结果php数组python不能直接用,而xml字符串的格式不是标准的,所以也不能用标准模块解析。【不标准的地方是某些节点的名称是以数字开头的】,所以写个简单的脚本来解析一下文件,用来做接口测试。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
#!/usr/bin/env python
# encoding: utf-8
"""Regex-based reader for simple, attribute-free XML strings.

The interface responses this script was written for use node names that
start with a digit, which standard XML parsers reject. Nodes are therefore
located with regular expressions instead of a real XML parser.

Known limitation: a node nested inside another node of the same name is
matched incorrectly, because the non-greedy pattern stops at the first
closing tag. Avoid such names in the xpath.
"""
import re

_CDATA_OPEN = '<![CDATA['
_CDATA_CLOSE = ']]>'


class xmlparse(object):
    """Look up node contents in a simple XML string with a tiny xpath dialect.

    Attributes:
        xmlstr: the string passed to the constructor, unchanged.
        xmldom: ``xmlstr`` without its leading ``<?xml ...?>`` declaration;
            byte strings declared as gbk/gb2312 are re-encoded to UTF-8.
        xmlnodelist: every match found for the last step of the most recent
            ``parse`` call (raw contents, CDATA markers included).
        xpath: the path given to the most recent ``parse`` call.
    """

    def __init__(self, xmlstr):
        self.xmlstr = xmlstr
        self.xmldom = self.__convet2utf8()
        self.xmlnodelist = []
        self.xpath = ''

    def __convet2utf8(self):
        """Return the document body without the XML declaration.

        A byte string whose declaration mentions gbk or gb2312 is transcoded
        to UTF-8. Already-decoded text is returned as is.
        """
        headstr = self.__get_head()
        # The declaration is matched at position 0, so slicing removes exactly
        # that one occurrence (and is a no-op when there is no declaration).
        xmldomstr = self.xmlstr[len(headstr):]
        if not hasattr(xmldomstr, 'decode'):
            # Text without a decode() method is already decoded.
            return xmldomstr
        declared = headstr.lower()
        for codec in ('gbk', 'gb2312'):
            if codec in declared:
                # Transcode the stripped body, never self.xmlstr, otherwise
                # the declaration would be put back into the result.
                return xmldomstr.decode(codec).encode('utf-8')
        return xmldomstr

    def __get_head(self):
        """Return the leading ``<?xml ...?>`` declaration, or '' if absent."""
        # Non-greedy so that a later '?>' in the document is not swallowed.
        headregobj = re.match(r'<\?xml.*?\?>', self.xmlstr, re.DOTALL)
        return headregobj.group() if headregobj else ''

    def __split_step(self, step):
        """Split one path step such as ``name[2]`` into ``(name, 2)``.

        A step without brackets selects index 0.
        """
        spcindex = step.find('[')
        if spcindex > -1:
            return step[:spcindex], int(step[spcindex + 1:-1])
        return step, 0

    def __strip_cdata(self, text):
        """Remove one enclosing ``<![CDATA[ ... ]]>`` wrapper, if present."""
        if text.startswith(_CDATA_OPEN) and text.endswith(_CDATA_CLOSE):
            return text[len(_CDATA_OPEN):-len(_CDATA_CLOSE)]
        return text

    def parse(self, xpath):
        """Return the content of the node addressed by ``xpath``.

        ``xpath`` is a '/'-separated list of node names, each optionally
        followed by a zero-based ``[index]`` choosing among the matches found
        at that step. Every step is searched anywhere inside the text
        selected by the previous step, so intermediate levels may be omitted.
        An empty path returns the whole document body.

        Args:
            xpath: path such as ``'/a/product_id'`` or ``'/product_id[1]'``.

        Returns:
            The selected node's content, with an enclosing CDATA wrapper
            removed.

        Raises:
            IndexError: a step matched fewer nodes than its index requires.
            ValueError: an ``[index]`` is not an integer.
        """
        self.xpath = xpath
        # Split every step up front so a malformed index fails before any
        # matching is attempted.
        steps = [self.__split_step(step) for step in xpath.split('/') if step]

        xmlnodestr = self.xmldom
        xmlnodelist = []
        for name, index in steps:
            # Escape the name so characters like '.' are matched literally.
            tag = re.escape(name)
            # DOTALL lets node content span several lines.
            xmlnodelist = re.findall(
                '<%s>(.*?)</%s>' % (tag, tag), xmlnodestr, re.DOTALL)
            try:
                xmlnodestr = xmlnodelist[index]
            except IndexError:
                raise IndexError(
                    'no <%s> node at index %d for xpath %r'
                    % (name, index, xpath))
            xmlnodestr = self.__strip_cdata(xmlnodestr)
        self.xmlnodelist = xmlnodelist
        return xmlnodestr


if __name__ == '__main__':
    xmlstr = '<?xml version="1.0" encoding="utf-8" standalone="yes" ?><resultObject><a><product_id>aaaaa</product_id><product_name><![CDATA[bbbbb]]></a><b><product_id>bbbbb</product_id><product_name><![CDATA[bbbbb]]></b></product_name></resultObject>'
    xp = xmlparse(xmlstr)
    # Single-argument print calls give the same output on Python 2 and 3.
    print('xmlstr: %s' % xp.xmlstr)
    print('xmldom: %s' % xp.xmldom)
    for demo_xpath in ('/product_id', '/product_id[1]', '/a/product_id'):
        print('------------------------------')
        getstr = xp.parse(demo_xpath)
        print('xpath: %s' % xp.xpath)
        print('get list: %s' % (xp.xmlnodelist,))
        print('get string: %s' % getstr)
运行结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
xmlstr: <?xml version="1.0" encoding="utf-8" standalone="yes" ?><resultObject><a><product_id>aaaaa</product_id><product_name><![CDATA[bbbbb]]></a><b><product_id>bbbbb</product_id><product_name><![CDATA[bbbbb]]></b></product_name></resultObject>
xmldom: <resultObject><a><product_id>aaaaa</product_id><product_name><![CDATA[bbbbb]]></a><b><product_id>bbbbb</product_id><product_name><![CDATA[bbbbb]]></b></product_name></resultObject>
------------------------------
xpath: /product_id
get list: ['aaaaa', 'bbbbb']
get string: aaaaa
------------------------------
xpath: /product_id[1]
get list: ['aaaaa', 'bbbbb']
get string: bbbbb
------------------------------
xpath: /a/product_id
get list: ['aaaaa']
get string: aaaaa
因为返回的xml格式比较简单,没有带属性的节点,所以处理起来就比较简单了。但测试还是发现有一个bug。即当相同节点嵌套时正则匹配会出问题,该问题可以通过避免在xpath中出现有嵌套节点的名称来解决,否则只有重写复杂的机制了。
希望本文所述对大家的Python程序设计有所帮助。