top of page

AWS ETHENA
 

1. Athena , select S3 data from Athena
​
Copy S3 URL by Go to VPC -> Click VPC ID -> Flow logs -> Destination name -> AWSsLogs -> AccountID -> vpcflowlogs -> us-east-1
​
 
Create table

CREATE EXTERNAL TABLE `vpc_flow_logs`(

      `version` int, 

      `account_id` string, 

      `interface_id` string, 

      `srcaddr` string, 

      `dstaddr` string, 

      `srcport` int, 

      `dstport` int, 

      `protocol` bigint, 

      `packets` bigint, 

      `bytes` bigint, 

      `start` bigint, 

      `end` bigint, 

      `action` string, 

      `log_status` string, 

      `vpc_id` string, 

      `subnet_id` string, 

      `instance_id` string, 

      `tcp_flags` int, 

      `type` string, 

      `pkt_srcaddr` string, 

      `pkt_dstaddr` string, 

      `region` string, 

      `az_id` string, 

      `sublocation_type` string, 

      `sublocation_id` string, 

      `pkt_src_aws_service` string, 

      `pkt_dst_aws_service` string, 

      `flow_direction` string, 

      `traffic_path` int)

    PARTITIONED BY ( 

      `date` date)

    ROW FORMAT DELIMITED 

      FIELDS TERMINATED BY ' ' 

    STORED AS INPUTFORMAT 

      'org.apache.hadoop.mapred.TextInputFormat' 

    OUTPUTFORMAT 

      'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'

    LOCATION

      's3://ais-vpcflow-logs-{{accountID}}/AWSLogs/{{AccountID}}/vpcflowlogs/us-east-1'

    TBLPROPERTIES (

      'skip.header.line.count'='1', 

      'transient_lastDdlTime'='1670359996')

Add partition

 

ALTER TABLE vpc_flow_logs ADD PARTITION (date='2022-12-06')

    LOCATION 's3://package--logs-xxxxx/AWSLogs/xxxx/flowlogs/us-east-1/2022/12/06'

​

    ALTER TABLE vpc_flow_logs drop PARTITION (date='2022-12-06')


This is because MSCK REPAIR TABLE foos supports only partitioning with key-value pairs in file paths, such as:


MSCK REPAIR TABLE vpc_flow_logs

Query to check data


 

SELECT * 

    FROM vpc_flow_logs 

    WHERE interface_id = 'eni-0fef3c35xxx'

    LIMIT 100;

flogs: 4; refjected


 

SELECT *  FROM vpc_flow_logs 

    where tcp_flags=4 and srcport=1711 and srcadd='x.x.x.x'

    LIMIT 100;

List all partitios;

show partitions vpc_flow_logs

bottom of page