LINUX.ORG.RU

История изменений

Исправление arcanis, (текущая версия) :

Первый раз слышу про проблемы с парсингом имейла.

Видимо, ты их не парсил руками. Вот тебе кусок старого кода на питоне, который парсит _только_ часть заголовков:

# get mail text
try:
    with open(os.path.join(path, mail), 'r') as mail_file:
        attached_mail = email.message_from_file(mail_file)
except (IOError, TypeError):
    logging.error('Could not get data from {}'.format(mail), exc_info=True)
    continue
# to fields
rec_list = attached_mail.get_all('To', [])
bcc_list = attached_mail.get_all('Bcc', [])
cc_list = attached_mail.get_all('Cc', [])
recipients = ', '.join([addr[1] for addr in email.utils.getaddresses(rec_list + bcc_list + cc_list)])
if not recipients:
    # there is no any recipients
    # seems to be bug in the Google API, a mail named as 'Attachment'
    continue
# from fields
sender = ', '.join([addr[1] for addr in email.utils.getaddresses(attached_mail.get_all('From', []))])
# decode subject
subject, subj_encoding = email.Header.decode_header(attached_mail.get('Subject', ''))[0]
if subj_encoding is not None:
    try:
        subject = subject.decode(subj_encoding)
    except:
        # fuck you users which mail agents send invalid encoding
        pass
# date
date, date_encoding = email.Header.decode_header(attached_mail.get('Date', ''))[0]
except:
if date_encoding is not None:
    date = date.decode(date_encoding)
try:
    date = date_parser.parse(date)
except ValueError:
    # some clients send invalid TZ/datetime format, lets assign current date
    date = datetime.datetime.now()

но на бумажке выглядит просто, да

Исходная версия arcanis, :

Первый раз слышу про проблемы с парсингом имейла.

Видимо, ты их не парсил руками. Вот тебе кусок старого кода на питоне, который, по сути, парсит _только_ заголовки:

# get mail text
try:
    with open(os.path.join(path, mail), 'r') as mail_file:
        attached_mail = email.message_from_file(mail_file)
except (IOError, TypeError):
    logging.error('Could not get data from {}'.format(mail), exc_info=True)
    continue
# to fields
rec_list = attached_mail.get_all('To', [])
bcc_list = attached_mail.get_all('Bcc', [])
cc_list = attached_mail.get_all('Cc', [])
recipients = ', '.join([addr[1] for addr in email.utils.getaddresses(rec_list + bcc_list + cc_list)])
if not recipients:
    # there is no any recipients
    # seems to be bug in the Google API, a mail named as 'Attachment'
    continue
# from fields
sender = ', '.join([addr[1] for addr in email.utils.getaddresses(attached_mail.get_all('From', []))])
# decode subject
subject, subj_encoding = email.Header.decode_header(attached_mail.get('Subject', ''))[0]
if subj_encoding is not None:
    try:
        subject = subject.decode(subj_encoding)
    except:
        # fuck you users which mail agents send invalid encoding
        pass
# date
date, date_encoding = email.Header.decode_header(attached_mail.get('Date', ''))[0]
except:
if date_encoding is not None:
    date = date.decode(date_encoding)
try:
    date = date_parser.parse(date)
except ValueError:
    # some clients send invalid TZ/datetime format, lets assign current date
    date = datetime.datetime.now()

но на бумажке выглядит просто, да