{"id":5804,"date":"2024-12-12T21:00:51","date_gmt":"2024-12-12T13:00:51","guid":{"rendered":"https:\/\/ixyzero.com\/blog\/?p=5804"},"modified":"2024-12-12T21:00:51","modified_gmt":"2024-12-12T13:00:51","slug":"hive-sql%e4%b8%ad%e7%9a%84regexp_extract_all%e5%87%bd%e6%95%b0%e7%9a%84%e5%ad%a6%e4%b9%a0","status":"publish","type":"post","link":"https:\/\/ixyzero.com\/blog\/archives\/5804.html","title":{"rendered":"Hive SQL\u4e2d\u7684regexp_extract_all\u51fd\u6570\u7684\u5b66\u4e60"},"content":{"rendered":"\n<p>=Start=<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">\u7f18\u7531\uff1a<\/h4>\n\n\n\n<p>\u4e4b\u524d\u6ca1\u600e\u4e48\u7528\u8fc7 regexp_extract_all \u8fd9\u4e2a\u51fd\u6570\uff0c\u7528\u7684\u6bd4\u8f83\u591a\u7684\u662f regexp_extract \u8fd9\u4e2a\uff0c\u4f46\u662f\u8fd1\u671f\u5728\u5206\u6790\u5904\u7406\u4e00\u4e9b\u65e5\u5fd7\u7684\u65f6\u5019\u53d1\u73b0 regexp_extract_all \u8fd9\u4e2a\u51fd\u6570\u8fd8\u662f\u633a\u6709\u7528\u7684\uff0c\u6240\u4ee5\u6574\u7406\u603b\u7ed3\u4e00\u4e0b\uff0c\u65b9\u4fbf\u540e\u9762\u6709\u9700\u8981\u7684\u65f6\u5019\u53c2\u8003\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">\u6b63\u6587\uff1a<\/h4>\n\n\n\n<h5 class=\"wp-block-heading\">\u53c2\u8003\u89e3\u7b54\uff1a<\/h5>\n\n\n\n<h5 class=\"wp-block-heading\">\u8bed\u6cd5&amp;\u53c2\u6570&amp;\u8fd4\u56de<\/h5>\n\n\n\n<p><code>regexp_extract_all(str, regexp [, idx] )<\/code><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>str\uff1a\u5339\u914d\u7684 STRING \u8868\u8fbe\u5f0f\u3002<\/li>\n\n\n\n<li>regexp\uff1a\u5177\u6709\u5339\u914d\u6a21\u5f0f\u7684 STRING \u8868\u8fbe\u5f0f\u3002<\/li>\n\n\n\n<li>idx\uff1a\u5927\u4e8e\u6216\u7b49\u4e8e 0 \u7684\u53ef\u9009\u6574\u6570\u8868\u8fbe\u5f0f\uff0c\u9ed8\u8ba4\u503c\u4e3a 1\u3002<\/li>\n<\/ul>\n\n\n\n<p>\u8fd4\u56de ARRAY \u3002<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u5b57\u7b26\u4e32 regexp \u5fc5\u987b\u662f Java \u6b63\u5219\u8868\u8fbe\u5f0f\u3002<\/li>\n\n\n\n<li>\u4f7f\u7528\u6587\u672c\u65f6\uff0c\u8bf7\u4f7f\u7528raw-literal\uff08r \u524d\u7f00\uff09\u6765\u907f\u514d\u8f6c\u4e49\u5b57\u7b26\u9884\u5904\u7406\u3002<\/li>\n\n\n\n<li>regexp \u53ef\u4ee5\u5305\u542b\u591a\u4e2a\u7ec4\u3002 idx \u6307\u793a\u8981\u63d0\u53d6\u7684\u6b63\u5219\u8868\u8fbe\u5f0f\u7ec4\u3002 \u5982\u679c idx \u4e3a 0\uff0c\u5219\u8868\u793a\u5339\u914d\u6574\u4e2a\u6b63\u5219\u8868\u8fbe\u5f0f\u3002<\/li>\n<\/ul>\n\n\n\n<h5 class=\"wp-block-heading\">\u4e00\u4e9bSQL\u6837\u4f8b<\/h5>\n\n\n\n<pre class=\"wp-block-code\"><code>SELECT\nregexp_extract_all('100-200, 300-400', '(\\\\d+)-(\\\\d+)', 1) as v1\n,regexp_extract_all('100-200, 300-400', '(\\\\d+)-(\\\\d+)', 0) as vx\n,regexp_extract_all('100-200, 300-400', '(\\\\d+)', 1) as v2\n,regexp_extract_all('100-200, 300-400', '(\\\\d+)', 0) as v3\n,regexp_extract_all('100-200, 300-400', '\\\\d+', 0) as v4 --\u5982\u679c\u662f\u8981\u5339\u914d\u6240\u6709\u6570\u5b57\u7684\u8bdd\uff0cregexp_extract_all\u51fd\u6570\u7684\u7b2c3\u4e2a\u53c2\u6570\u4e00\u822c\u51990\uff0c\u8868\u793a\u5339\u914d\u6574\u4e2a\u6b63\u5219\u8868\u8fbe\u5f0f\n\n\n&#91;\"100\",\"300\"]\n&#91;\"100-200\",\"300-400\"]\n&#91;\"100\",\"200\",\"300\",\"400\"]\n&#91;\"100\",\"200\",\"300\",\"400\"]\n&#91;\"100\",\"200\",\"300\",\"400\"]<\/code><\/pre>\n\n\n\n<p>\u5bf9\u4e8eSQL\u8bed\u53e5\u7684\u89e3\u6790\u548c\u5185\u5bb9\u63d0\u53d6\uff0c\u6709\u4e00\u4e9b\u6ce8\u610f\u4e8b\u9879<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\u5148\u683c\u5f0f\u5316\u518d\u63d0\u53d6\uff08\u683c\u5f0f\u5316\u6307\u7684\u662f\u628a\u4e00\u4e9b\u6ce8\u91ca\u548c\u591a\u4f59\u7684\u7a7a\u767d\u7b26\u53f7\u8fdb\u884c\u66ff\u6362\uff0c\u907f\u514d\u540e\u9762\u5904\u7406\u810f\u6570\u636e\uff09<\/li>\n\n\n\n<li>\u7b80\u5355\u7c97\u66b4\u7684 regexp_extract_all \u597d\u5904\u5728\u4e8e\u4e00\u6b21\u63d0\u53d6\uff0c\u53cd\u590d\u4f7f\u7528\uff1b\u574f\u5904\u5728\u4e8e\u53ef\u80fd\uff08\u80af\u5b9a\uff09\u4f1a\u9047\u5230\u810f\u6570\u636e\u3010<em>\u63a8\u8350\u8fd9\u79cd\u65b9\u5f0f<\/em>\uff0c\u4f46\u9700\u8981\u505a\u4e00\u4e9b\u529b\u6240\u80fd\u53ca\u7684\u8fc7\u6ee4\uff0c\u51cf\u5c11\u540e\u9762\u810f\u6570\u636e\u51fa\u73b0\u7684\u6982\u7387\u3011<\/li>\n\n\n\n<li>\u5b9a\u5411 regexp_extract \u63d0\u53d6\u7684\u597d\u5904\u5728\u4e8e\u7cbe\u51c6\uff0c\u574f\u5904\u5728\u4e8e\u603b\u6709\u81ea\u5df1\u8003\u8651\u4e0d\u5468\u7684\u5730\u65b9\uff0c\u5bfc\u81f4\u9700\u8981\u53cd\u590d\u4fee\u6539\u548c\u91cd\u8dd1\u6570\u636e\uff0c\u5f71\u54cd\u6548\u7387\u548c\u6d6a\u8d39\u8d44\u6e90<\/li>\n\n\n\n<li>\u4e0d\u65ad\u5b66\u4e60\u3001\u4e0d\u65ad\u4f18\u5316\u66f4\u65b0<\/li>\n<\/ol>\n\n\n\n<p>\u63d0\u53d6SQL\u8bed\u53e5\u4e2d\u7684\u6240\u6709\u957f\u5ea6\u5927\u4e8e\u7b49\u4e8e5\u7684\u6570\u5b57\u5b57\u7b26\u4e32\uff08\u9700\u8981\u6ce8\u610f\u6392\u9664\u4e00\u4e9b\u4e0d\u9700\u8981\u7684\u6570\u5b57\uff0c\u6bd4\u5982\u65e5\u671f\u683c\u5f0f\u7684\u6570\u5b57\u5b57\u7b26\u4e32\u3001limit\u540e\u9762\u7684\u6570\u5b57\u5b57\u7b26\u4e32\u3001set\u8bed\u53e5\u540e\u9762\u53ef\u80fd\u5305\u542b\u7684\u6570\u5b57\u5b57\u7b26\u4e32\u7b49\u7b49\uff09<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>regexp_extract_all<\/li>\n\n\n\n<li>array_except &#8211; \u7528\u4e8e\u8ba1\u7b97\u4e24\u4e2a\u6570\u7ec4\u7684\u5dee\u96c6<\/li>\n<\/ul>\n\n\n\n<pre class=\"wp-block-code\"><code>-- \u793a\u4f8b\u4ee3\u7801\nwith t1 as (\n-- \u7528\u53cc\u5f15\u53f7\u62ec\u8d77\u6765\u7684\u5185\u5bb9\u53ef\u4ee5\u76f4\u63a5\u6362\u884c\u5199\uff0c\u53ea\u662f\u5176\u4e2d\u4e0d\u80fd\u5305\u542b\u82f1\u6587\u5206\u53f7\uff0c\u8fd9\u70b9\u8981\u6ce8\u610f\uff0c\u5177\u4f53\u539f\u56e0\u4e4b\u524d\u7684\u6587\u7ae0\u8bb0\u5f55\u8fc7\u4e86\uff0c\u5728\u6b64\u5c31\u4e0d\u8d58\u8ff0\nselect \"select\n1234567 as id_int ,'2345678' as id_str\nfrom db1.tbl1\nwhere\ndate between '20240101' and '20241201'\nor dt between '2024-01-01' and '2024-12-01'\nand condition1 &gt; 3456789\nand condition2 &lt; 9876543\nlimit 999999\nlimit 100001\" as multi_line_var\n)\n\nSELECT\nsize(num_array) as num_cnt\n,num_array\n-- ,num_array2\n,num_array3\n,array_except(num_array,yyyymmdd_array) as filted_num --\u6392\u9664\u6389\u5339\u914d\u51fa\u6765\u7684yyyymmdd\u65e5\u671f\u683c\u5f0f\u7684\u6570\u5b57\u5b57\u7b26\u4e32\n,num1\n,yyyy_mm_dd_array\n,yyyymmdd_array\n,limit_num\nFROM\n(select multi_line_var\n,regexp_extract_all(multi_line_var,'&#91;1-9]\\\\d{4,}',0) as num_array --\u53ef\u80fd\u4f1a\u628alimit\u540e\u9762\u7684\u6570\u5b57\u7ed9\u5339\u914d\u51fa\u6765\uff0c\u4e00\u79cd\u529e\u6cd5\u662f\u5728\u6b64\u4e4b\u524d\u5c31\u628alimit\u90a3\u4e00\u5c0f\u6bb5\u7ed9\u53bb\u6389\n-- ,regexp_extract_all(multi_line_var,'(?&lt;!limit\\\\s{1,})&#91;1-9]\\\\d{4,}',0) as num_array2 --\u53e6\u4e00\u79cd\u529e\u6cd5\u662f\u501f\u52a9\u6b63\u5219\u8868\u8fbe\u5f0f\u7684\u53cd\u5411\u5426\u5b9a\u9884\u67e5\u6765\u6392\u9664\uff0c\u4f46\u4e0d\u4e00\u5b9a\u80fd\u6392\u9664\u5e72\u51c0\uff0c\u6bd4\u5982\u8fd9\u91cc\u7684100001\u53ef\u4ee5\u88ab\u6392\u9664\uff0c\u4f46\u662f999999\u4e0d\u884c\uff0c\u800c\u4e14\u67e5\u8be2\u901f\u5ea6\u6162\u4e86\u5f88\u591a\n,regexp_extract_all(regexp_replace(multi_line_var,'\\\\s+limit\\\\s+(\\\\d+)\\\\s*','') ,'&#91;1-9]\\\\d{4,}',0) as num_array3\n,regexp_extract_all(multi_line_var,'20&#91;1-2]&#91;0-9]\\\\-(0&#91;1-9]|1&#91;0-2])\\\\-(0&#91;1-9]|1&#91;0-9]|2&#91;0-9]|3&#91;0-1])',0) as yyyy_mm_dd_array\n,regexp_extract_all(multi_line_var,'20&#91;1-2]&#91;0-9](0&#91;1-9]|1&#91;0-2])(0&#91;1-9]|1&#91;0-9]|2&#91;0-9]|3&#91;0-1])',0) as yyyymmdd_array\n,regexp_extract(multi_line_var, '\\\\s+limit\\\\s+(\\\\d+)\\\\s*', 1) as limit_num --regexp_extract\u51fd\u6570\u53ea\u4f1a\u63d0\u53d6\u7b2c\u4e00\u4e2a\u5339\u914d\u7684\u5185\u5bb9\uff0c\u8fd9\u91cc\u53ea\u662f\u4e3a\u4e86\u6f14\u793a\u6240\u4ee5\u5728\u4e00\u4e2aSQL\u8bed\u53e5\u91cc\u9762\u8fde\u7740\u5199\u4e862\u4e2alimit\nfrom t1\n)x1\nlateral view explode(num_array) num as num1\n\n# num_array\n&#91;\"1234567\",\"2345678\",\"20240101\",\"20241201\",\"3456789\",\"9876543\",\"999999\",\"100001\"]\n\n# num_array3\n&#91;\"1234567\",\"2345678\",\"20240101\",\"20241201\",\"3456789\",\"9876543\",\"100001\"]\n\n# filted_num\n&#91;\"1234567\",\"2345678\",\"3456789\",\"9876543\",\"999999\",\"100001\"]\n\n# yyyy_mm_dd_array\n&#91;\"2024-01-01\",\"2024-12-01\"]\n\n# yyyymmdd_array\n&#91;\"20240101\",\"20241201\"]\n\n# limit_num\n999999<\/code><\/pre>\n\n\n\n<p>\u63d0\u53d6SQL\u4e2d\u6240\u6709\u7684\u4e2d\u6587\u5b57\u7b26\u4e32<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>-- \u548c\u524d\u9762\u7684\u6570\u5b57\u5b57\u7b26\u4e32\u63d0\u53d6\u4e00\u4e2a\u5957\u8def\uff0c\u4e0d\u8fc7\u8fd9\u91cc\u7528\u7684\u662funicode\u4e2d\u5e38\u89c1\u4e2d\u6587\u5b57\u7b26\u7684\u7f16\u7801\u8303\u56f4\n,regexp_extract_all(query_sql,'&#91;\\\\u4e00-\\\\u9fa5]+',0) as chinese_array<\/code><\/pre>\n\n\n\n<h5 class=\"wp-block-heading\">\u53c2\u8003\u94fe\u63a5\uff1a<\/h5>\n\n\n\n<p>regexp_extract_all \u51fd\u6570<br><a href=\"https:\/\/learn.microsoft.com\/zh-cn\/azure\/databricks\/sql\/language-manual\/functions\/regexp_extract_all\">https:\/\/learn.microsoft.com\/zh-cn\/azure\/databricks\/sql\/language-manual\/functions\/regexp_extract_all<\/a><\/p>\n\n\n\n<p>\u6b63\u5219\u5f0f\u51fd\u6570<br><a href=\"https:\/\/cloud.tencent.com\/document\/product\/614\/63505\">https:\/\/cloud.tencent.com\/document\/product\/614\/63505<\/a><\/p>\n\n\n\n<p>HIVE\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u622a\u53d6\u91d1\u989d\u4fe1\u606f<br><a href=\"https:\/\/blog.csdn.net\/weixin_43929753\/article\/details\/135401116\">https:\/\/blog.csdn.net\/weixin_43929753\/article\/details\/135401116<\/a><\/p>\n\n\n\n<p>HIVE\u7528\u6237\u81ea\u4e49\u5b9a\u51fd\u6570:\u6b63\u5219\u5339\u914d\u6240\u6709\u5b50\u4e32<br><a href=\"https:\/\/juejin.cn\/post\/6898522310425362445\">https:\/\/juejin.cn\/post\/6898522310425362445<\/a><\/p>\n\n\n\n<p>hive\u81ea\u5b9a\u4e49\u51fd\u6570, \u76ee\u524d\u5305\u542b:regexp_extract_all<br><a href=\"https:\/\/github.com\/leeshuaichao\/hive_functions\">https:\/\/github.com\/leeshuaichao\/hive_functions<\/a><\/p>\n\n\n\n<p>\u6b63\u5219\u8868\u8fbe\u5f0f\u7684\u57fa\u672c\u4f7f\u7528<br><a href=\"https:\/\/docs.guandata.com\/product\/bi\/428146874701053952\">https:\/\/docs.guandata.com\/product\/bi\/428146874701053952<\/a><\/p>\n\n\n\n<p>hive \u6b63\u5219\u8868\u8fbe\u5f0f\u5339\u914d\u591a\u4e2a\u7ed3\u679c\u7684\u65b9\u6cd5<br><a href=\"https:\/\/blog.51cto.com\/u_16213440\/7602587\">https:\/\/blog.51cto.com\/u_16213440\/7602587<\/a><\/p>\n\n\n\n<p>hive regexp_extract weirdness<br><a href=\"https:\/\/stackoverflow.com\/questions\/8497090\/hive-regexp-extract-weirdness\/8497735#8497735\">https:\/\/stackoverflow.com\/questions\/8497090\/hive-regexp-extract-weirdness\/8497735#8497735<\/a><\/p>\n\n\n\n<p>hive udf\u51fd\u6570 array_except \u5b9e\u73b0<br><a href=\"https:\/\/blog.csdn.net\/qq_35515661\/article\/details\/130161544\">https:\/\/blog.csdn.net\/qq_35515661\/article\/details\/130161544<\/a><\/p>\n\n\n\n<p>Hive array_except GenericUDF\u7f16\u5199<br><a href=\"https:\/\/ixyzero.com\/blog\/archives\/5594.html\">https:\/\/ixyzero.com\/blog\/archives\/5594.html<\/a><\/p>\n\n\n\n<p>\u63d0\u53d6SQL\u4e2d\u7684limit\u9650\u5236\u6570\u503c<br><a href=\"https:\/\/ixyzero.com\/blog\/archives\/5428.html\">https:\/\/ixyzero.com\/blog\/archives\/5428.html<\/a><\/p>\n\n\n\n<p>\u6570\u7ec4\u51fd\u6570\u548c\u8fd0\u7b97\u7b26<br><a href=\"https:\/\/www.alibabacloud.com\/help\/zh\/sls\/user-guide\/array-functions-and-operators#section-h1n-rrz-7xn\">https:\/\/www.alibabacloud.com\/help\/zh\/sls\/user-guide\/array-functions-and-operators#section-h1n-rrz-7xn<\/a><\/p>\n\n\n\n<p>\u6b63\u5219\u8868\u8fbe\u5f0f\u5728\u524d\u9762\u6216\u8005\u540e\u9762\u4e0d\u60f3\u5339\u914d\u67d0\u5b57\u7b26\u4e32\u7684\u7279\u6b8a\u7528\u6cd5<br><a href=\"https:\/\/blog.csdn.net\/zh515858237\/article\/details\/113877648\">https:\/\/blog.csdn.net\/zh515858237\/article\/details\/113877648<\/a><\/p>\n\n\n\n<p>\u8fd1\u671f\u7684\u4e00\u4e9b\u96f6\u788e\u77e5\u8bc6\u70b9\u6574\u7406<br><a href=\"https:\/\/ixyzero.com\/blog\/archives\/5508.html\">https:\/\/ixyzero.com\/blog\/archives\/5508.html<\/a><\/p>\n\n\n\n<p>Unicode\u6c49\u5b57\u8303\u56f4[bak]<br><a href=\"https:\/\/ixyzero.com\/blog\/archives\/1524.html\">https:\/\/ixyzero.com\/blog\/archives\/1524.html<\/a><\/p>\n\n\n\n<p>=END=<\/p>\n","protected":false},"excerpt":{"rendered":"<p>=Start= \u7f18\u7531\uff1a \u4e4b\u524d\u6ca1\u600e\u4e48\u7528\u8fc7 regexp_extract_all \u8fd9\u4e2a\u51fd\u6570\uff0c\u7528\u7684\u6bd4\u8f83\u591a\u7684\u662f reg [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[14,23,7],"tags":[2115,1261,2019,2388,1262,19,217,2387],"class_list":["post-5804","post","type-post","status-publish","format-standard","hentry","category-database","category-knowledgebase-2","category-programing","tag-array_except","tag-hive","tag-regexp_extract","tag-regexp_extract_all","tag-sql","tag-tips","tag-unicode","tag-yyyymmdd"],"views":1803,"_links":{"self":[{"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/posts\/5804","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/comments?post=5804"}],"version-history":[{"count":1,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/posts\/5804\/revisions"}],"predecessor-version":[{"id":5805,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/posts\/5804\/revisions\/5805"}],"wp:attachment":[{"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/media?parent=5804"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/categories?post=5804"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ixyzero.com\/blog\/wp-json\/wp\/v2\/tags?post=5804"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}