LINUX.ORG.RU
ФорумAdmin

Ищется CLI перекодировщик кодировки для почты


0

0

В общем, хотим мы сделать нормальную Web морду для SurgeMail, ибо встроенная с русскими кодировками дружить не хочет.

Кто-нибудь когда-нибудь встречал утилиту, которая преобразует письма (base64, MIME, Quoted-printable) из одной кодировки в другую?

По сути, нужна утилита, которая распаковывает (base64, MIME, Quoted-printable), перекодирует из UTF-8, CP1251 и т.д. в нужную нам кодировку и запаковывает обратно.

Если есть иные варианты, я буду рад выслушать.

★★★★★

Одной не видел. Видел много разных:

Здесь краткий обзор
http://faqs.org.ru/internet/encode.htm

Вот по меньшей мере часть:
# rpm -ql sharutils-4.6.1-2 | grep bin
/usr/bin/compress-dummy
/usr/bin/mail-files
/usr/bin/mailshar
/usr/bin/remsync
/usr/bin/shar
/usr/bin/unshar
/usr/bin/uudecode
/usr/bin/uuencode

Valmont ★★★
()
Ответ на: комментарий от Valmont

Я имею в виду, в одном оно таки не будет. Во всяком случае, еще и с UTF8-CP1251. Идеология в unix другая. Либо использовать несколько утилит в обвязке iconv/recode/libiconv, либо написать перловый скрипт с нужным набором модулей и перекодировкой также заниматься.

Valmont ★★★
()

# cat uncode.pl
#!/usr/bin/perl

#-------------------------------------------
#
# This demonstrator program shows how the body of a message may be
# canonicalized by decoding any 'quoted-printable' or 'base64' portions
# before hashing (irrespective of whether they were present when the
# message was submitted or whether they were encoded en route because some
# non-8BITMIME or non-BINARYMIME MTA was encountered).
#
# This canonicalization is either an addition to the presently proposed
# 'relaxed' canonicalization for bodies (the ignoring of trailing
# whitespace and of trailing blank lines has been omitted from this
# demonstration for simplicity's sake), or else it could form the basis of
# some new '8bit-safe' canonicalization.
#
# The presence of the MIME media types 'multipart' and 'message' can bring
# about a tree of potentially unbounded depth, with the possibility of
# encountering encoded objects at any node. Hence a recursive descent of
# this tree is necessary.
#
# Little attempt has been made to check for mal-formed MIME structures,
# since they are going to fail elsewhere in the mail system anyway.
# However, a few cases which would cause the canonicalization to fail will
# give rise to fatal errors, and there are a couple of warnings for some
# odd cases that are technically illegal, but will not cause signatures to
# fail.
#
# Care has been taken to ensure that naked LFs do not occur outside of
# regions where the CTE is 'binary'. However, it is too much trouble to
# test for naked CRs; if present (whether in binary regions or not) they
# will pass through and get hashed like any other character.
#
#-------------------------------------------

use English;
use MIME::Head;
use MIME::QuotedPrint;
use MIME::Base64;
use Getopt::Long;

# Pre-size $_ so that a buffer long enough for most lines already exists;
# if a line needs more, Perl will allocate it.
my $LONGLINE = ' ' x 1000;
$_ = $LONGLINE;

# Command-line switches.  They are file-scoped lexicals (rather than
# implicit package globals) shared by the subs defined below.
my ($unix, $test, $noctedrop, $keep_heads, $quiet);

GetOptions(
    "unix|u"    => \$unix,          # repair naked LFs instead of dying
    "test|t"    => \$test,          # pass bodies through without decoding
    "noctedrop" => \$noctedrop,     # leave Content-Transfer-Encoding alone
    "head"      => \$keep_heads,    # also output the top-level headers
    "quiet|q"   => \$quiet,         # suppress whine() warnings
    "help|h"    => \&usage,
) or usage();
# usage() is called with explicit parens: the old '&usage' form silently
# forwarded the current @_ to the callee.

sub usage {
    # Print the command-line summary on STDOUT and terminate the program.
    # Also installed as the handler for -h/--help, so any arguments passed
    # in by Getopt::Long are ignored.
    my @help_lines = (
        'usage: uncode.pl [options] [(infile|-) [outfile]]',
        '-h --help print this message',
        '-u --unix ensure lines end with CRLF, except within CTE binary;',
        'for testing, or for MUAs running on Unix',
        '-t --test no decoding; output should be identical to input,',
        'modulo added CRs',
        '--noctedrop do not replace Content-Transfer-Encodings;',
        'for testing only',
        '--head keep top-level headers',
        '-q --quiet no warnings; recommended when verifying',
    );
    my $help_text = join("\n", @help_lines) . "\n";
    print STDOUT $help_text;
    exit;
}

#-------------------------------------------
#
# Set up $in and $out handlers; by default, STDIN and STDOUT
# (this program is designed to operate in a pipeline).
#
my ($in, $out);

# Input: first positional argument, or STDIN when it is absent or '-'.
# Three-argument open keeps characters in the file name from being read as
# an open mode; the '-' convention that two-argument open provided
# implicitly is therefore handled explicitly here.
my $infile = $ARGV[0];
if (defined $infile && $infile ne '-') {
    open($in, '<', $infile)
        or die "cannot open '$infile' for reading: $!\n";
}
else {
    $in = \*STDIN;
}

# Output: second positional argument, or STDOUT when it is absent or '-'.
my $outfile = $ARGV[1];
if (defined $outfile && $outfile ne '-') {
    open($out, '>', $outfile)
        or die "cannot open '$outfile' for writing: $!\n";
}
else {
    $out = \*STDOUT;
}

#-------------------------------------------
#
# whine () for non-fatal problems
#
sub whine {
    # Emit a non-fatal diagnostic via warn(), unless --quiet was given.
    return if $quiet;
    warn(@_);
}

#-------------------------------------------
#
# print_line ($line, $encoding)
#
# on $out, as per $encoding; the $line parameter is a Ref to the
# actual line, to avoid copying
#
sub print_line {
    # Write one line to $out, decoding it first if required.
    #
    #   $line     - reference to the line text (a reference avoids a copy)
    #   $encoding - Content-Transfer-Encoding of the enclosing part;
    #               'quoted-printable' and 'base64' are decoded, anything
    #               else is written through untouched
    my ($line, $encoding) = @_;

    if ($encoding eq 'quoted-printable') {
        ### decode_qp ends a hard line break with a single LF, so the
        ### CRLF has to be restored by hand.  That must not happen for
        ### an empty line or for a soft line break (input ending in '='),
        ### and it is only done when the decoded text really ends in LF:
        ### otherwise the final decoded character would be overwritten
        ### (or, for empty decoded text, substr would be fatal).
        my $decoded = decode_qp($$line);
        if ($$line ne ''
            && $$line !~ m/=\r?\n\z/
            && $decoded =~ m/\n\z/) {
            substr($decoded, -1, 1) = "\r\n";
        }
        print $out $decoded;
    }
    elsif ($encoding eq 'base64') {
        print $out decode_base64($$line);
    }
    else {
        print $out $$line;
    }
}

#-------------------------------------------
#
# read_chunk ($bound, $encoding)
#
# reads, and transcribes using the specified $encoding, up to the
# given MIME boundary, or up to EOF
#
sub read_chunk {
    # Copy lines from $in to $out (through print_line) until the given
    # MIME boundary, or EOF, is reached.
    #
    #   $bound    - boundary string of the enclosing multipart; empty or
    #               undefined means there is none, so only EOF can end
    #               the chunk
    #   $encoding - Content-Transfer-Encoding to apply to the body lines
    #
    # Returns ($eos_type, $boundary_line): $eos_type is 'DELIM', 'CLOSE'
    # or 'EOF'; $boundary_line is the CRLF-terminated boundary line that
    # was found (not yet printed), or '' at EOF.
    #
    # Dies on a line without CRLF in a non-binary region unless --unix.
    my ($bound, $encoding) = @_;

    ### The boundary is matched literally (\Q..\E): boundaries may
    ### legally contain characters such as '+', '(', ')', '?' and '.'.
    ### With no boundary at all nothing is matched, so that e.g. a
    ### "-- " signature separator is not mistaken for a delimiter.
    my $boundary_re;
    if (defined $bound && $bound ne '') {
        $boundary_re = qr/^(--\Q$bound\E(--)?[ \t]*)\r?\n/;
    }

    my $pending = "";    # we always print one line in arrear

    while (defined(my $line = <$in>)) {  # reads up to the next '\n',
                                         # which might be a long way away
                                         # in genuine binary attachments
        if ($encoding ne 'binary' && substr($line, -2, 2) ne "\r\n") {
            ### a line ending in naked LF when not in CTE 'binary':
            ### in Unix mode we fix it, otherwise it is a fatal error
            if ($unix) {
                ### replace the LF, or append CRLF when the final line
                ### of the input has no newline at all (nothing is
                ### overwritten in that case)
                $line =~ s/\n?\z/\r\n/;
            }
            else {
                die "naked LF in non-binary region\n";
            }
        }

        if (defined $boundary_re
            && substr($line, 0, 2) eq '--'
            && $line =~ s/$boundary_re/$1\r\n/) {
            ### it was the expected boundary; note whether it is a
            ### CLOSEing boundary
            my $eos_type = ($2 ? 'CLOSE' : 'DELIM');

            ### flush the held-back line; the boundary itself is never
            ### encoded, so it is returned for printing later
            print_line(\$pending, $encoding);
            return ($eos_type, $line);
        }

        print_line(\$pending, $encoding);
        $pending = $line;
    }

    print_line(\$pending, $encoding);
    return ('EOF', '');
}

Somewho ★★
()
Ответ на: комментарий от Somewho

# Продолжение
#-------------------------------------------
#
# A map from encoding to integer for checking that a larger encoding
# is not contained within a smaller one.
# '' is a dummy encoding representing the initial state.
#
# Rank of each Content-Transfer-Encoding; '' stands for the state above
# the top level.  The three 7-bit-clean encodings share the highest rank.
my %encodings = ('' => 0, 'binary' => 1, '8bit' => 2);
@encodings{'7bit', 'quoted-printable', 'base64'} = (3) x 3;

#-------------------------------------------
#
# process_header ($deftype, $old_encoding)
#
# Reads headers, both at the top level and at the start of multipart
# and message types. It returns the Content-Type (defaulting to $deftype)
# and Content-Transfer-Encoding; $old_encoding will be '' at the
# top level.
#
sub process_header {
    # Read one header block from $in and, where appropriate, write it
    # to $out.
    #
    #   $deftype      - Content-Type to assume when the header has none
    #   $old_encoding - CTE of the enclosing part; '' at the top level
    #
    # Returns ($mime_type, $encoding, $bound) as reported by MIME::Head.
    my ($deftype, $old_encoding) = @_;

    my $head = MIME::Head->new;
    $head->read($in);

    ### pick up everything we need before the CTE field is overwritten
    my $mime_type = $head->mime_type($deftype);
    my $encoding  = $head->mime_encoding;
    my $bound     = $head->multipart_boundary;

    ### we don't know the original CTE, but anyway that CTE is no longer
    ### of interest; so we change it to 'binary' to make sure the
    ### 'naked LF' test is not triggered during verification
    ### NOTE(review): the trailing "\r" presumably makes the serialized
    ### field end in CRLF - confirm against MIME::Head->as_string
    if (!($test or $noctedrop)) {
        $head->replace('content-transfer-encoding', "binary\r");
    }

    my $newstring = $head->as_string;
    $newstring =~ s/\r?\n/\r\n/og if $unix;

    ### top-level headers are canonicalized separately, so they are only
    ### output on request (--test or --head); nested headers always are
    my $nested = ($old_encoding ne '');
    if ($nested || $test || $keep_heads) {
        print $out $newstring, "\r\n";
    }

    return ($mime_type, $encoding, $bound);
}

#-------------------------------------------
#
# process_part ($oldbound, $deftype, $old_encoding)
#
# process_part is the heart of the whole system. A 'part' is a part of
# a multipart, or a message type, or even the whole message. This
# routine walks through the tree of parts, calling itself recursively
# as needed.
#
sub process_part {
    # Process one 'part' - a part of a multipart, a message type, or the
    # whole message - recursing into nested parts as needed.
    #
    #   $oldbound     - boundary of the enclosing multipart ('' if none)
    #   $deftype      - default Content-Type for this part
    #   $old_encoding - CTE of the enclosing part ('' at the top level)
    #
    # Returns ($eos_type, $last): how the part ended ('DELIM', 'CLOSE' or
    # 'EOF') and the boundary line that ended it, still to be printed by
    # the caller.
    #
    # Dies when a multipart's own boundary is never found.
    my ($oldbound, $deftype, $old_encoding) = @_;

    my ($mime_type, $encoding, $bound) =
        process_header($deftype, $old_encoding);

    ### you can't have 'binary' inside an '8bit' object, or
    ### '8bit' inside a '7bit' one
    if ($encodings{$encoding} < $encodings{$old_encoding}) {
        whine("illegal $encoding within $old_encoding\n");
    }

    my ($type, $subtype) = split('/', $mime_type);

    ### both are lexical, so each level of the recursion has its own
    ### copy ($last used to be an implicit package global)
    my ($eos_type, $last);

    if ($type eq 'multipart') {
        ### multipart
        # determine the default type of the parts
        my $retype = $subtype eq 'digest' ? 'message/rfc822' : 'text/plain';
        my $more_parts = 1;

        ### parse preamble
        # the preamble is not supposed to be displayed, but it still
        # gets hashed; it MUST have CTE '7bit', because there is no way
        # to encode it
        ($eos_type, $last) = read_chunk($bound, "7bit");
        # an expected boundary was never found; a fatal error
        die "boundary $bound not found\n" if ($eos_type eq 'EOF');
        if ($eos_type eq 'CLOSE') {
            whine("zero parts in multipart\n");
            $more_parts = 0;
        }
        # print the boundary; boundaries MUST be ASCII because there is
        # no way to encode them
        print_line(\$last, "7bit");

        ### process parts
        while ($more_parts) {
            # the recursive call for each part within this multipart
            ($eos_type, $last) = process_part($bound, $retype, $encoding);
            die "boundary $bound not found\n" if ($eos_type eq 'EOF');
            $more_parts = 0 if ($eos_type eq 'CLOSE');
            # print the boundary of this part; we have already noted
            # whether it is the CLOSEing boundary of this multipart
            print_line(\$last, "7bit");
        }

        ### process epilogue
        # the epilogue also is not for display, but still gets hashed;
        # we are now looking for the next boundary ($oldbound) within
        # our parent
        ($eos_type, $last) = read_chunk($oldbound, "7bit");
    }
    elsif ($type eq 'message') {
        ### message
        ### all message types have the same structure, though only some
        ### of them, such as message/rfc822 and message/partial, possess
        ### bodies which might need encoding; so, having already
        ### processed the header for the current part, we now encounter
        ### the header of the message proper - another recursive call;
        ### the message will be terminated by the boundary of the
        ### parent, hence the $oldbound parameter
        ### (the default type was misspelled 'text.plain')
        ($eos_type, $last) = process_part($oldbound, 'text/plain', $encoding);
    }
    else {
        ### simple part
        ### this is a tip of the part tree, and the only place where
        ### 'quoted-printable' or 'base64' can occur or may need to be
        ### decoded; under --test they are passed through as '7bit' so
        ### that nothing is decoded, while read_chunk can still
        ### distinguish '7/8bit' from 'binary'
        my $encoded   = ($encoding eq 'quoted-printable'
                         || $encoding eq 'base64');
        my $chunk_cte = ($test && $encoded) ? '7bit' : $encoding;
        ($eos_type, $last) = read_chunk($oldbound, $chunk_cte);
    }

    return ($eos_type, $last);
}


#-------------------------------------------
#
# main program
#
# The three parameters represent:
# no boundary for any parent
# no default type (actually, it is text/plain, but process_header
# will tell us that)
# dummy CTE representing initial state above top level
#
# Process the whole message: no parent boundary, no default type, and the
# dummy '' encoding that represents the state above the top level.
process_part('', '', '');

# Buffered write errors only surface when the handle is closed, so close
# it explicitly and check the result instead of losing output silently.
close($out) or die "error closing output: $!\n";

Somewho ★★
()
Вы не можете добавлять комментарии в эту тему. Тема перемещена в архив.