I had a task the other day where I had 110GB of compressed log files that I wanted to import into Impala (Cloudera). Impala does not currently support compressed files, so I had to decompress them all. I created this handy script and thought you might find it useful. I mounted the S3 bucket using s3fs, as I mentioned in my earlier post.
#!/bin/bash
#
# Decompress gzip'd log files stored on HDFS.
#
# For every *.gz file name found in LOCAL_LOG_DIR, the HDFS file of the same
# name under HDFS_LOG_DIR is streamed through gunzip into a new HDFS file
# without the .gz suffix. The compressed original is removed only after the
# decompressed copy has been written successfully.
#
# Environment (both optional):
#   LOCAL_LOG_DIR  directory whose *.gz entries name the files to convert
#                  (default: /media/ephemeral0/logs)
#   HDFS_LOG_DIR   HDFS directory holding the files
#                  (default: /user/hdfs/oms/logs)
#
# Exit status: 0 when every file was converted, non-zero otherwise.

# Make a pipeline fail when ANY stage fails; without this a failing
# `hdfs dfs -cat` or gunzip would be masked by a succeeding last stage.
set -o pipefail

LOCAL_LOG_DIR="${LOCAL_LOG_DIR:-/media/ephemeral0/logs}"
HDFS_LOG_DIR="${HDFS_LOG_DIR:-/user/hdfs/oms/logs}"

#######################################
# Run a command and report how long it took.
# Arguments: the command to run, followed by its arguments
# Outputs:   the command's own output, then "Elapsed time: Xh Ym Zs" on stdout
# Returns:   the exit status of the command that was run
#######################################
elapsed() {
  local start=$SECONDS
  local status=0
  local total hours minutes seconds

  "$@" || status=$?

  total=$(( SECONDS - start ))
  hours=$(( total / 3600 ))
  minutes=$(( total / 60 % 60 ))
  seconds=$(( total % 60 ))

  echo "Elapsed time: ${hours}h ${minutes}m ${seconds}s"
  return "$status"
}

#######################################
# Decompress one .gz file that lives in HDFS_LOG_DIR.
# Globals:   HDFS_LOG_DIR (read)
#            FILE (read, only as a fallback when no argument is given)
#            UFILE (written: the file name without the .gz suffix)
# Arguments: $1 - name of the compressed file, relative to HDFS_LOG_DIR
# Outputs:   listing of the decompressed file on stdout; errors on stderr
# Returns:   0 on success; 2 on a missing or non-.gz name; otherwise the
#            status of the step that failed
#######################################
convert() {
  local gz_name=${1:-${FILE:-}}
  local src dst

  if [[ -z "$gz_name" ]]; then
    echo "convert: no file name given" >&2
    return 2
  fi
  if [[ "$gz_name" != *.gz ]]; then
    echo "convert: '$gz_name' does not end in .gz, skipping" >&2
    return 2
  fi

  # Remove the .gz extension from the compressed file name
  UFILE=${gz_name%.gz}
  src="${HDFS_LOG_DIR}/${gz_name}"
  dst="${HDFS_LOG_DIR}/${UFILE}"

  # Decompress gz file: HDFS -> gunzip -> HDFS ("-put -" reads from stdin)
  if ! sudo -u hdfs hdfs dfs -cat "$src" \
    | sudo -u hdfs gunzip -d \
    | sudo -u hdfs hdfs dfs -put - "$dst"; then
    # Keep the original: it is the only intact copy of the data. The
    # destination is left untouched as well, because it may be a file that
    # existed before this run.
    echo "convert: failed to decompress '$src' to '$dst'; original kept" >&2
    return 1
  fi

  # Discard original gz file (only reached after a successful decompress)
  sudo -u hdfs hdfs dfs -rm -skipTrash "$src" || return
  sudo -u hdfs hdfs dfs -ls "$dst"
}

#######################################
# Convert every *.gz entry found in LOCAL_LOG_DIR.
# Globals:   LOCAL_LOG_DIR (read), UFILE (read, set by convert)
# Outputs:   one progress line per file on stdout; failures on stderr
# Returns:   0 if all files were converted, 1 if at least one failed
#######################################
main() {
  local path name
  local failures=0

  for path in "$LOCAL_LOG_DIR"/*.gz; do
    # An unmatched glob stays literal; skip it instead of "converting" it.
    [[ -e "$path" ]] || continue
    name=${path##*/}

    if elapsed convert "$name"; then
      echo "Decompressed $name to $UFILE on hdfs"
    else
      echo "Failed to decompress $name" >&2
      failures=$(( failures + 1 ))
    fi
  done

  (( failures == 0 ))
}

main "$@"