Hive SQL日期字符串规范化


=Start=

缘由:

简单整理一下个人觉得有用/通用的SQL代码片段,方便后面有需要的时候参考和使用。

正文:

参考解答:
/*
日期字符串统一规范化成【yyyy-MM-dd HH:mm:ss】格式的一方面是看起来方便,另一方面是可以使用 unix_timestamp() 函数转换成 unix_timestamp 时间戳的数字形式,方便后面的比对
*/

datetime_str
#length=19 (2023-10-17 10:18:48) (一般自研的系统的日志格式都是这样,是不需要额外处理的) #OK
#length=20 (Oct 18 2023 02:04:07) (Cisco VPN) #需要转换
#length=24 (2023-10-17T10:20:15+0800) (Cisco AP) #需要处理

将上面各日志中的日期时间字符串转换成 unix_timestamp 时间戳的数字形式,方便后面的比对

,unix_timestamp('2022-11-16 00:00:00') as ts_bigint3 -- 1668528000

将日期字符串转换成unix时间戳,Hive SQL中只有 unix_timestamp() 函数,而且(默认)只接受【yyyy-MM-dd HH:mm:ss】这种格式,不是这种格式的我们需要手动转换然后再去调用函数,否则效果可能不及预期。

with t as (
  select
  '2023-10-18 02:04:07' as datetime1
  ,'Oct 18 2023 02:04:07' as datetime2
  ,'2023-10-17T10:20:15+0800' as datetime3
  ,cast('2023-10-18 02:04:07' as timestamp) as timestamp1 -- 2023-10-18 02:04:07.0
  ,cast('Oct 18 2023 02:04:07' as timestamp) as timestamp2 -- null
  ,cast('2023-10-17T10:20:15+0800' as timestamp) as timestamp3 -- null
)

select
datetime1,datetime2,datetime3,timestamp1,timestamp2,timestamp3
,date_format(timestamp1,'yyyy-MM-dd HH:mm:ss') as ts1
,date_format(timestamp2,'yyyy-MM-dd HH:mm:ss') as ts2 -- null
,date_format(timestamp3,'yyyy-MM-dd HH:mm:ss') as ts3 -- null
from
t
;



-- Hive SQL处理非【yyyy-MM-dd HH:mm:ss】格式的日期字符串的方法
-- 笨办法,但效果其实挺好
,case
    when (datetime_str like 'Jan%') then concat(substr(datetime_str,8,4),'-01-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Feb%') then concat(substr(datetime_str,8,4),'-02-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Mar%') then concat(substr(datetime_str,8,4),'-03-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Apr%') then concat(substr(datetime_str,8,4),'-04-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'May%') then concat(substr(datetime_str,8,4),'-05-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Jun%') then concat(substr(datetime_str,8,4),'-06-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Jul%') then concat(substr(datetime_str,8,4),'-07-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Aug%') then concat(substr(datetime_str,8,4),'-08-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Sep%') then concat(substr(datetime_str,8,4),'-09-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Oct%') then concat(substr(datetime_str,8,4),'-10-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Nov%') then concat(substr(datetime_str,8,4),'-11-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Dec%') then concat(substr(datetime_str,8,4),'-12-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
else datetime_str end as time_str


'Oct 18 2023 02:04:07' as datetime_str
,substr(datetime_str,8,4) as s_year
,substr(datetime_str,1,3) as s_month
,substr(datetime_str,5,2) as s_day
,substr(datetime_str,13,8) as s_hhmmss
,concat(substr(datetime_str,8,4),'-mm-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8)) as yyyymmdd_hhmmss


with t as (
  select
  '2023-10-18 02:04:07' as datetime1
  ,'Oct 18 2023 02:04:07' as datetime_str
  ,'2023-10-17T10:20:15+0800' as datetime3
)

select
datetime_str
,substr(datetime_str,8,4) as s_year
,substr(datetime_str,1,3) as s_month
,substr(datetime_str,5,2) as s_day
,substr(datetime_str,13,8) as s_hhmmss
,case
    when (datetime_str like 'Jan%') then concat(substr(datetime_str,8,4),'-01-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Feb%') then concat(substr(datetime_str,8,4),'-02-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Mar%') then concat(substr(datetime_str,8,4),'-03-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Apr%') then concat(substr(datetime_str,8,4),'-04-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'May%') then concat(substr(datetime_str,8,4),'-05-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Jun%') then concat(substr(datetime_str,8,4),'-06-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Jul%') then concat(substr(datetime_str,8,4),'-07-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Aug%') then concat(substr(datetime_str,8,4),'-08-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Sep%') then concat(substr(datetime_str,8,4),'-09-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Oct%') then concat(substr(datetime_str,8,4),'-10-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Nov%') then concat(substr(datetime_str,8,4),'-11-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Dec%') then concat(substr(datetime_str,8,4),'-12-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))

    when (datetime_str like '%T%+0800') then concat(substr(datetime_str,1,10),' ',substr(datetime_str,12,8))

    -- when (length(datetime_str)=19) then datetime_str
else datetime_str end as time_str

,case
    when (datetime3 like '%T%+0800') then concat(substr(datetime3,1,10),' ',substr(datetime3,12,8))
else datetime3 end as time_str2
from
t
;


-- 创建一个宏用于处理非 yyyymmdd-HHMMSS 格式的日期时间字符串
create temporary macro timestr_format(datetime_str string) case
    when (datetime_str like 'Jan%') then concat(substr(datetime_str,8,4),'-01-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Feb%') then concat(substr(datetime_str,8,4),'-02-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Mar%') then concat(substr(datetime_str,8,4),'-03-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Apr%') then concat(substr(datetime_str,8,4),'-04-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'May%') then concat(substr(datetime_str,8,4),'-05-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Jun%') then concat(substr(datetime_str,8,4),'-06-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Jul%') then concat(substr(datetime_str,8,4),'-07-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Aug%') then concat(substr(datetime_str,8,4),'-08-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Sep%') then concat(substr(datetime_str,8,4),'-09-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Oct%') then concat(substr(datetime_str,8,4),'-10-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Nov%') then concat(substr(datetime_str,8,4),'-11-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like 'Dec%') then concat(substr(datetime_str,8,4),'-12-',substr(datetime_str,5,2),' ',substr(datetime_str,13,8))
    when (datetime_str like '%T%+0800') then concat(substr(datetime_str,1,10),' ',substr(datetime_str,12,8))
else datetime_str end;

select
timestr_format('Oct 18 2023 02:04:07')
,timestr_format('2023-10-17T10:20:15+0800')
,timestr_format('2023-10-18 02:14:07')
;

其实 unix_timestamp() 函数是可以接收2个参数的,第2个参数指定要转换的第1个参数的字符串格式,之前测过,但好久没用,就又忘了……

select
'Jan 09 2023 15:30:21' as input
,unix_timestamp('Jan 09 2023 15:30:21', 'MMM dd yyyy HH:mm:ss') as parsed
;
-- input    parsed  
-- Jan 09 2023 15:30:21    1673249421


unix_timestamp(string date, string pattern)

Convert time string with given pattern to Unix time stamp (in seconds), return 0 if fail.
在第二个参数中指定要转换的字符串格式,然后将第一个参数的时间日期字符串转换成以秒为格式的unix时间戳数字,当转换失败时返回0。
参考链接:

一些SQL代码片段整理
https://ixyzero.com/blog/archives/5538.html

Hive SQL中的日期时间处理
https://ixyzero.com/blog/archives/5362.html

Hive SQL用宏实现常用功能
https://ixyzero.com/blog/archives/5380.html

=END=


《“Hive SQL日期字符串规范化”》 有 1 条评论

  1. “`
    select
    ’11/Apr/2024:21:21:21 +0800′ as input
    ,unix_timestamp(’11/Apr/2024:21:21:21 +0800′, ‘dd/MMM/yyyy:HH:mm:ss +0800’) as parsed
    ;

    unix_timestamp(string date, string pattern)
    “`

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注