How to upload a large number of files to Amazon S3 very fast (parallel upload with Perl)
Usage
1. The following CPAN modules are required:
- Amazon::S3
- Parallel::ForkManager
2. Set your AWS credentials as environment variables:
vi ~/.aws
export AWS_ACCESS_KEY_ID=hogehoge
export AWS_SECRET_ACCESS_KEY=fugafuga
source ~/.aws
3. Run it:
perl upload.pl /foo/bar/mydir/ s3://bucketname/foo/bar/mydir/ -p 10
The -p option sets the number of parallel processes.
Be careful: setting it too high can freeze your machine.
It is best to start with a small value and increase it gradually.
Note that with -p 1, no parallelization is performed.
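Under the hood, the parallel mode relies on Parallel::ForkManager's standard fork/wait pattern, which you can see in upload.pl below. Here is a minimal, self-contained sketch of that pattern on its own (the job list, child count, and the print standing in for an S3 upload are only placeholders):

#!/usr/bin/perl
use strict;
use warnings;
use Parallel::ForkManager;

my @jobs = (1 .. 5);                        # placeholder work items
my $pm   = Parallel::ForkManager->new(3);   # at most 3 children at once

for my $job (@jobs) {
    $pm->start and next;                      # parent: fork a child, move on
    print "job $job handled by pid $$\n";     # child: do the real work here
    $pm->finish;                              # child: exit, reaped by the parent
}
$pm->wait_all_children;                       # parent: wait for every child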
Benchmark
I measured the time to upload 529 image files from a NIFTY Cloud Medium server to Amazon S3.

| Method | Time |
| --- | --- |
| s3cmd put --recursive | 72.3 s |
| upload.pl, no parallelism | 63.0 s |
| upload.pl, 2 parallel | 43.6 s |
| upload.pl, 5 parallel | 22.5 s |
| upload.pl, 10 parallel | 13.6 s |
| upload.pl, 15 parallel | 11.6 s |
| upload.pl, 20 parallel | 11.7 s |
| upload.pl, 30 parallel | 11.9 s |
In this test, 15 parallel processes was the fastest.
The optimal degree of parallelism will vary with your environment (image size and count, machine specs, network speed).
Try a few values and see what works best, for example with a small timing loop like the one sketched below.
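This is only a rough sketch of how you might time each run; it assumes upload.pl is in the current directory and reuses the example paths from above, so adjust the parallel counts and paths to your own setup:

#!/usr/bin/perl
use strict;
use warnings;
use Time::HiRes qw(gettimeofday tv_interval);

# each iteration re-uploads the same directory with a different -p value
for my $p (1, 2, 5, 10, 15) {
    my $t0 = [gettimeofday];
    system('perl', 'upload.pl', '-p', $p,
           '/foo/bar/mydir/', 's3://bucketname/foo/bar/mydir/') == 0
        or die "upload.pl failed with -p $p";
    printf "-p %2d : %.1f sec\n", $p, tv_interval($t0);
}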
Source code
Apologies that this is not modern Perl. upload.pl:
#!/usr/bin/perl
package Amazon::S3::Uploader::File;
use strict;
use warnings;
use base qw( Class::Accessor );

__PACKAGE__->mk_accessors( qw( local_path remote_dir bucket ) );

sub new {
    my $class      = shift;
    my $path       = shift;
    my $remote_dir = shift;
    my $bucket     = shift;
    bless {
        local_path => $path,
        remote_dir => $remote_dir,
        bucket     => $bucket,
    }, $class;
}

# upload this one file to S3
sub upload {
    my $self   = shift;
    my $bucket = $self->{bucket};
    $bucket->add_key_filename($self->remote_key, $self->local_path)
        or die "cannot upload file " . $self->from_to;
}

sub from_to {
    my $self = shift;
    return $self->local_path . " -> " . $self->remote_key;
}

# remote path (s3://bucket/...) derived from the local path
sub remote_path {
    my $self = shift;
    my $local_path = $self->{local_path};
    $local_path =~ s|^\./||;
    return $self->remote_dir . $local_path;
}

# S3 object key: the remote path with the s3://bucket/ prefix stripped
sub remote_key {
    my $self = shift;
    my $remote_path = $self->remote_path;
    $remote_path =~ s|^s3://[0-9a-z\-]+/||i;
    $remote_path;
}
package Amazon::S3::Uploader;
use strict;
use warnings;
use File::Find;
use Amazon::S3;
use Parallel::ForkManager;

our $verbose;

sub upload {
    my $local_dir   = shift;
    my $remote_dir  = shift;
    my $max_process = shift;
    my $config      = shift;

    my $s3 = Amazon::S3->new({
        aws_access_key_id     => $config->{aws_access_key_id},
        aws_secret_access_key => $config->{aws_secret_access_key},
    });
    my ($bucket_name) = ( $remote_dir =~ /s3:\/\/([^\/]+)\// );
    my $bucket = $s3->bucket($bucket_name) or die 'cannot get bucket';

    _print("local dir  : " . $local_dir . "\n");
    _print("remote dir : " . $remote_dir . "\n");
    _print("max process: " . $max_process . "\n");

    # collect every regular file under the current directory
    # (main has already chdir'ed into $local_dir)
    my @local_files;
    my $callback = sub {
        return unless -f;
        my $file = Amazon::S3::Uploader::File->new($File::Find::name, $remote_dir, $bucket);
        push @local_files, $file;
    };
    find($callback, '.');

    if ($max_process > 1) {
        upload_files_parallel(\@local_files, $max_process);
    } else {
        upload_files_single(\@local_files);
    }
}

sub upload_files_single {
    my @files = @{ shift() };
    _print("uploading by a single process\n");
    my $i = 0;
    my $total_num = @files;
    for my $file (@files) {
        $i++;
        $file->upload();
        _print("ok $i / $total_num " . $file->from_to . "\n");
    }
    _print(sprintf("%d files uploaded\n", $i));
}

sub upload_files_parallel {
    my @files = @{ shift() };
    my $max   = shift;
    _print("uploading by multiple processes\n");
    my $pm = Parallel::ForkManager->new($max);
    $pm->run_on_finish(
        sub {
            my ($pid, $exit_code, $ident) = @_;
            if ($exit_code != 0) {
                die('error !');
            }
        });
    my $i = 0;
    my $total_num = @files;
    for my $file (@files) {
        $i++;
        $pm->start and next;   # parent: fork a child and move on to the next file
        $file->upload();       # child: upload one file
        _print("ok $i / $total_num " . $file->from_to . "\n");
        $pm->finish;           # child: exit (parent checks the exit code in run_on_finish)
    }
    $pm->wait_all_children;
    my $count = @files;
    _print(sprintf("%d files uploaded\n", $count));
}

sub _print {
    return unless $verbose;
    print @_;
}
package main;
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;

my %opts = (
    process => 1,
);
GetOptions(
    \%opts,
    'verbose',
    'process=i',
    'help',
) or $opts{help}++;
pod2usage(2) if $opts{help};

my $local_dir   = shift;
my $remote_dir  = shift;
my $max_process = $opts{process};
pod2usage(2) if ! defined $local_dir || ! defined $remote_dir;
die "no such directory $local_dir" if ! -d $local_dir;
$max_process = 1 unless $max_process;

my $config = {
    aws_access_key_id     => $ENV{AWS_ACCESS_KEY_ID},
    aws_secret_access_key => $ENV{AWS_SECRET_ACCESS_KEY},
};

chdir $local_dir;
$Amazon::S3::Uploader::verbose = $opts{verbose};
Amazon::S3::Uploader::upload($local_dir, $remote_dir, $max_process, $config);

=head1 SYNOPSIS

  upload.pl [options] <local_dir> <s3://bucket/path/>

    -v  verbose
    -p  max process (default 1 if omitted)
    -h  help (this)

  example:
    upload.pl /path/to/dir/ s3://bucketname/foo/bar/dir/

=cut