AWS ETHENA
1. Athena , select S3 data from Athena
​
Copy S3 URL by Go to VPC -> Click VPC ID -> Flow logs -> Destination name -> AWSsLogs -> AccountID -> vpcflowlogs -> us-east-1
​
Create table
CREATE EXTERNAL TABLE `vpc_flow_logs`(
`version` int,
`account_id` string,
`interface_id` string,
`srcaddr` string,
`dstaddr` string,
`srcport` int,
`dstport` int,
`protocol` bigint,
`packets` bigint,
`bytes` bigint,
`start` bigint,
`end` bigint,
`action` string,
`log_status` string,
`vpc_id` string,
`subnet_id` string,
`instance_id` string,
`tcp_flags` int,
`type` string,
`pkt_srcaddr` string,
`pkt_dstaddr` string,
`region` string,
`az_id` string,
`sublocation_type` string,
`sublocation_id` string,
`pkt_src_aws_service` string,
`pkt_dst_aws_service` string,
`flow_direction` string,
`traffic_path` int)
PARTITIONED BY (
`date` date)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ' '
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://ais-vpcflow-logs-{{accountID}}/AWSLogs/{{AccountID}}/vpcflowlogs/us-east-1'
TBLPROPERTIES (
'skip.header.line.count'='1',
'transient_lastDdlTime'='1670359996')
Add partition
ALTER TABLE vpc_flow_logs ADD PARTITION (date='2022-12-06')
LOCATION 's3://package--logs-xxxxx/AWSLogs/xxxx/flowlogs/us-east-1/2022/12/06'
​
ALTER TABLE vpc_flow_logs drop PARTITION (date='2022-12-06')
This is because MSCK REPAIR TABLE foos supports only partitioning with key-value pairs in file paths, such as:
MSCK REPAIR TABLE vpc_flow_logs
Query to check data
SELECT *
FROM vpc_flow_logs
WHERE interface_id = 'eni-0fef3c35xxx'
LIMIT 100;
flogs: 4; refjected
SELECT * FROM vpc_flow_logs
where tcp_flags=4 and srcport=1711 and srcadd='x.x.x.x'
LIMIT 100;
List all partitios;
show partitions vpc_flow_logs