{"id":360,"date":"2014-07-03T12:49:52","date_gmt":"2014-07-03T12:49:52","guid":{"rendered":"http:\/\/ixyzero.com\/blog\/?p=360"},"modified":"2014-07-03T12:49:52","modified_gmt":"2014-07-03T12:49:52","slug":"%e7%bd%91%e7%ab%99%e9%93%be%e6%8e%a5%e6%8a%93%e5%8f%96%e8%84%9a%e6%9c%acbak","status":"publish","type":"post","link":"https:\/\/ixyzero.com\/blog\/archives\/360.html","title":{"rendered":"\u7f51\u7ad9\u94fe\u63a5\u6293\u53d6\u811a\u672c[bak]"},"content":{"rendered":"<p>\u5229\u7528Python\u7684HTMLParser\u6a21\u5757\u5199\u7684\u94fe\u63a5\u6293\u53d6\u811a\u672c\uff0c\u5176\u5b9e\u53ef\u4ee5\u6539\u4e00\u6539\uff0c\u8ba9\u5b83\u53ea\u6293\u53d6\u975e\u6307\u5b9a\u57df\u540d\u7684\u94fe\u63a5\u5730\u5740\uff0c\u7136\u540e\u914d\u5408\u4e00\u4e2a\u767d\u540d\u5355\u5c31\u53ef\u4ee5\u8fdb\u884c\u7b80\u5355\u7684\u6302\u9a6c\u68c0\u6d4b\u4e86`(*\u2229_\u2229*)\u2032\uff0c\u811a\u672c\u5982\u4e0b\uff1a<\/p>\n<pre class=\"lang:python decode:true \">#!\/usr\/bin\/env python\n#coding: utf-8\n#Attention: ValueError: unknown url type: www.baidu.com --&gt; http:\/\/www.baidu.com\n\nimport sys\nimport urllib2\nimport re\nimport HTMLParser\n\nreload(sys)\nsys.setdefaultencoding('utf8')\n\nclass myparser(HTMLParser.HTMLParser):\n    def __init__(self):\n        HTMLParser.HTMLParser.__init__(self)\n    def handle_starttag(self, tag, attrs):\n        if (tag == 'a')|(tag == 'img'):#\u53ef\u6839\u636e\u9700\u8981\u8fdb\u884c\u589e\u5220\u6539\n            for name, value in attrs:\n                if (name == 'href')|(name == 'src'):#\u67e5\u8be2\u8be5\u4e0a\u9762\u4e24\u4e2a\u6807\u7b7e\u7684\u5c5e\u6027\n                    val = re.search('http:\/\/', value)#\u5339\u914d\u94fe\u63a5\u662f\u5426\u4e3a\u53ef\u7528\u94fe\u63a5\n                    if val != None:\n                        print value\n\nif len(sys.argv)==3 and sys.argv[1] == '-u':\n    content = (urllib2.urlopen(sys.argv[2])).read()#\u6253\u5f00\u7f51\u5740\u5e76\u8bfb\u53d6\u5185\u5bb9\n    con = myparser()\n    con.feed(content)#\u628acontent\u7684\u5185\u5bb9\uff0c\u4f20\u7ed9myparser\u5206\u6790\nelse:\n    print 'Usage:%s -u url'%sys.argv[0]<\/pre>\n<p>\u6682\u65f6\u8fd8\u6ca1\u6709\u4fee\u6539\uff0c\u4ee5\u540e\u6709\u673a\u4f1a\u518d\u8bf4\u5427\u2026\u2026<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u5229\u7528Python\u7684HTMLParser\u6a21\u5757\u5199\u7684\u94fe\u63a5\u6293\u53d6\u811a\u672c\uff0c\u5176\u5b9e\u53ef\u4ee5\u6539\u4e00\u6539\uff0c\u8ba9\u5b83\u53ea\u6293\u53d6\u975e\u6307\u5b9a\u57df\u540d\u7684\u94fe\u63a5\u5730\u5740\uff0c\u7136 [&hellip;]<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[7,12],"tags":[175,8],"class_list":["post-360","post","type-post","status-publish","format-standard","hentry","category-programing","category-tools","tag-htmlparser","tag-python"],"views":3255,"_links":{"self":[{"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/posts\/360","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/comments?post=360"}],"version-history":[{"count":0,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/posts\/360\/revisions"}],"wp:attachment":[{"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/media?parent=360"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/categories?post=360"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/tags?post=360"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}