From 2d1afefb1d1e773f48fbcfa142340036b9c0dec2 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Fri, 30 May 2014 16:46:54 -0700 Subject: [PATCH] irkerd: Handle UnicodeDecodeError in LineProtocol.data_received I just got the following in a message-of-the-day from leguin.freenode.net: Welcome to leguin.freenode.net in Ume\xe5, Sweden, EU! Where Ume\xe5 is Ume{U+00E5 LATIN SMALL LETTER A WITH RING ABOVE}. \xe5 is the ISO-8859-1 encoding. Since important messages from the IRC server should be in ASCII [1]: Regardless of being an 8-bit protocol, the delimiters and keywords are such that protocol is mostly usable from US-ASCII terminal and a telnet connection. So rather than trying some fancy charset-detection heuristics, just drop lines that don't decode properly. [1]: http://tools.ietf.org/html/rfc2812#section-2.2 --- irkerd | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/irkerd b/irkerd index d6686ff..9b1a63c 100755 --- a/irkerd +++ b/irkerd @@ -484,9 +484,14 @@ class LineProtocol(asyncio.Protocol): else: self.buffer = [] for line in lines: - line = str(line, self.encoding).strip() - LOG.debug('{}: line received: {!r}'.format(self, line)) - self.line_received(line=line) + try: + line = str(line, self.encoding).strip() + except UnicodeDecodeError as e: + LOG.warn('{}: invalid encoding in {!r} ({})'.format( + self, line, e)) + else: + LOG.debug('{}: line received: {!r}'.format(self, line)) + self.line_received(line=line) def datagram_received(self, data, addr): "Decode the raw bytes and pass the line to line_received" -- 2.26.2