dev-libs/uchardet: verbump to 0.0.5 wrt bug 569110

author Ilya Tumaykin <itumaykin@gmail.com>

Mon, 25 Jan 2016 08:03:06 +0000 (11:03 +0300)

committer Ilya Tumaykin <itumaykin@gmail.com>

Mon, 25 Jan 2016 09:29:16 +0000 (12:29 +0300)
author Ilya Tumaykin <itumaykin@gmail.com>
Mon, 25 Jan 2016 08:03:06 +0000 (11:03 +0300)
committer Ilya Tumaykin <itumaykin@gmail.com>
Mon, 25 Jan 2016 09:29:16 +0000 (12:29 +0300)
diff --git a/dev-libs/uchardet/Manifest b/dev-libs/uchardet/Manifest

index 9652429b564b3a61bd15a6c4aadd0a1b2509d8e6..885fbccb4240917c1a1eb19680c49b826e8bdcdf 100644 (file)
--- a/dev-libs/uchardet/Manifest
+++ b/dev-libs/uchardet/Manifest
@@ -1 +1,2 @@
  DIST uchardet-0.0.1.tar.gz 179207 SHA256 e238c212350e07ebbe1961f8f128faaa40f71b70d37b63ffa2fe12c664269ee6 SHA512 28fa8618a1a5f8cd36271fee3dd0e0bfbd2fdd219ad026a1382db366806d57f5be33ad4dfd765a9c31408a853edde157c3aeb717dcce360e56c3f63f4a1f1288 WHIRLPOOL 413a754c1e976c3bc24b6b66150dc4544f0fc565953bd4c0b062b37afbf4cb1caa5e2c014388133d2f1559d59b563d8ba812482ed8a11088fcfab3353ef6b8a0
+DIST uchardet-0.0.5.tar.gz 222864 SHA256 7c5569c8ee1a129959347f5340655897e6a8f81ec3344de0012a243f868eabd1 SHA512 e32ff3e7baa9804199e3ca240ce590fed3fcb539fe4d780c4ec205fa5cbd45415e2c8c8db51d97965f9f9bbaad1f34613d5ed2849aafd9bbc3dda850c0be20ac WHIRLPOOL 737becbbf1be09e049207311c964ee61e78bce3c3cdc31cd5a071a52aef22b5f0d803a243aac8b0f9840c19d27ffbac3e08454ec7a74c2bb85f19f15333e3af6
diff --git a/dev-libs/uchardet/files/uchardet-0.0.5-fix-ASCII-detection.patch b/dev-libs/uchardet/files/uchardet-0.0.5-fix-ASCII-detection.patch

new file mode 100644 (file)

index 0000000..c82aee8
--- /dev/null
+++ b/dev-libs/uchardet/files/uchardet-0.0.5-fix-ASCII-detection.patch
@@ -0,0 +1,116 @@
+commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f
+Author: Jehan <jehan@girinstud.io>
+Date:   Sat Dec 5 21:04:20 2015 +0100
+
+    Nearly-ASCII text with NBSP is still not ASCII.
+    
+    There is no "exception" in encoding. The non-breaking space 0xA0 is not
+    ASCII, and therefore returning "ASCII" will later create issues (for
+    instance trying to re-encode with iconv produces an error).
+    This was obviously an explicit decision in original code (according to
+    code comments), probably tied to specificity of the original program from
+    Mozilla. Now we want strict detection.
+    I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only
+    exception" (note that I could have returned any ISO-8859 charsets since
+    they all have this character in common).
+
+diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
+index ab8bae0..ff06b9d 100644
+--- a/src/nsUniversalDetector.cpp
++++ b/src/nsUniversalDetector.cpp
+@@ -47,6 +47,7 @@
+
+ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
+ {
++  mNbspFound = PR_FALSE;
+   mDone = PR_FALSE;
+   mBestGuess = -1;   //illegal value as signal
+   mInTag = PR_FALSE;
+@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector()
+ void
+ nsUniversalDetector::Reset()
+ {
++  mNbspFound = PR_FALSE;
+   mDone = PR_FALSE;
+   mBestGuess = -1;   //illegal value as signal
+   mInTag = PR_FALSE;
+@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+   PRUint32 i;
+   for (i = 0; i < aLen; i++)
+   {
+-    /* Other than 0xA0, if every other character is ASCII, the page is ASCII.
++    /* If every other character is ASCII or 0xA0, we don't run charset
++     * probers.
+      * 0xA0 (NBSP in a few charset) is apparently a rare exception
+-     * of non-ASCII character contained in ASCII text. */
++     * of non-ASCII character often contained in nearly-ASCII text. */
+     if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
+     {
+       /* We got a non-ASCII byte (high-byte) */
+@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+     }
+     else
+     {
+-      //ok, just pure ascii so far
+-      if ( ePureAscii == mInputState &&
+-        (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
++      /* Just pure ASCII or NBSP so far. */
++      if (aBuf[i] == '\xA0')
+       {
+-        //found escape character or HZ "~{"
++        /* ASCII with the only exception of NBSP seems quite common.
++         * I doubt it is really necessary to train a model here, so let's
++         * just make an exception.
++         */
++          mNbspFound = PR_TRUE;
++      }
++      else if (mInputState == ePureAscii &&
++               (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')))
++      {
++        /* We found an escape character or HZ "~{". */
+         mInputState = eEscAscii;
+       }
+       mLastChar = aBuf[i];
+@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+       mDone = PR_TRUE;
+       mDetectedCharset = mEscCharSetProber->GetCharSetName();
+     }
++    else if (mNbspFound)
++    {
++      mDetectedCharset = "ISO-8859-1";
++    }
+     else
+     {
+       /* ASCII with the ESC character (or the sequence "~{") is still
+@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+     break;
+
+   default:
+-    /* Pure ASCII */
+-    mDetectedCharset = "ASCII";
++    if (mNbspFound)
++    {
++      /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
++       * (though it could have been any ISO-8859 encoding). */
++      mDetectedCharset = "ISO-8859-1";
++    }
++    else
++    {
++      /* Pure ASCII */
++      mDetectedCharset = "ASCII";
++    }
+     break;
+   }
+   return NS_OK;
+diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
+index 4d9b460..9f0a4b1 100644
+--- a/src/nsUniversalDetector.h
++++ b/src/nsUniversalDetector.h
+@@ -72,6 +72,7 @@ protected:
+    virtual void Report(const char* aCharset) = 0;
+    virtual void Reset();
+    nsInputState  mInputState;
++   PRBool  mNbspFound;
+    PRBool  mDone;
+    PRBool  mInTag;
+    PRBool  mStart;
diff --git a/dev-libs/uchardet/files/uchardet-0.0.5-use-proper-package-name.patch b/dev-libs/uchardet/files/uchardet-0.0.5-use-proper-package-name.patch

new file mode 100644 (file)

index 0000000..b1ed889
--- /dev/null
+++ b/dev-libs/uchardet/files/uchardet-0.0.5-use-proper-package-name.patch
@@ -0,0 +1,30 @@
+commit b6d872bbec3be7abfccbdfd3d90e784cf7281c55
+Author: Jehan <jehan@girinstud.io>
+Date:   Tue Dec 15 21:40:16 2015 +0100
+
+    app: package name wrong in CMakeLists.txt.
+    
+    Probably coming from a copy-paste error when the build system was
+    originally created.
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 0b65c49..4f279e1 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -1,6 +1,6 @@
+ ######## Project settings
+ cmake_minimum_required(VERSION 2.8)
+-set (PACKAGE_NAME opencc)
++set (PACKAGE_NAME uchardet)
+ project (${PACKAGE_NAME} CXX C)
+ enable_testing()
+
+@@ -54,7 +54,7 @@ if (DEFINED SYSCONF_INSTALL_DIR)
+       set (DIR_ETC ${SYSCONF_INSTALL_DIR})
+ endif (DEFINED SYSCONF_INSTALL_DIR)
+
+-set (DIR_SHARE_UCHARDET ${DIR_SHARE}/opencc)
++set (DIR_SHARE_UCHARDET ${DIR_SHARE}/uchardet)
+ set (DIR_SHARE_LOCALE ${DIR_SHARE}/locale)
+
+ ######## Configuration
diff --git a/dev-libs/uchardet/uchardet-0.0.5.ebuild b/dev-libs/uchardet/uchardet-0.0.5.ebuild

new file mode 100644 (file)

index 0000000..1c0d57e
--- /dev/null
+++ b/dev-libs/uchardet/uchardet-0.0.5.ebuild
@@ -0,0 +1,33 @@
+# Copyright 1999-2016 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+# $Id$
+
+EAPI=5
+
+inherit cmake-utils
+
+DESCRIPTION="An encoding detector library ported from Mozilla"
+HOMEPAGE="https://github.com/BYVoid/uchardet"
+SRC_URI="https://github.com/BYVoid/${PN}/archive/v${PV}.tar.gz -> ${P}.tar.gz"
+
+LICENSE="MPL-1.1"
+SLOT="0"
+KEYWORDS="~amd64 ~x86"
+IUSE="static-libs test"
+
+PATCHES=(
+       "${FILESDIR}/${P}-fix-ASCII-detection.patch"
+       "${FILESDIR}/${P}-use-proper-package-name.patch"
+)
+
+src_prepare() {
+       use test || comment_add_subdirectory test
+       cmake-utils_src_prepare
+}
+
+src_configure() {
+       local mycmakeargs=(
+               $(cmake-utils_use_build static-libs STATIC)
+       )
+       cmake-utils_src_configure
+}
author	Ilya Tumaykin <itumaykin@gmail.com>
	Mon, 25 Jan 2016 08:03:06 +0000 (11:03 +0300)
committer	Ilya Tumaykin <itumaykin@gmail.com>
	Mon, 25 Jan 2016 09:29:16 +0000 (12:29 +0300)
dev-libs/uchardet/Manifest		patch \| blob \| history
dev-libs/uchardet/files/uchardet-0.0.5-fix-ASCII-detection.patch	[new file with mode: 0644]	patch \| blob
dev-libs/uchardet/files/uchardet-0.0.5-use-proper-package-name.patch	[new file with mode: 0644]	patch \| blob
dev-libs/uchardet/uchardet-0.0.5.ebuild	[new file with mode: 0644]	patch \| blob