From f3db4635c78ac61d40016277192dafe61a3f1023 Mon Sep 17 00:00:00 2001 From: JHCD Date: Tue, 14 Jul 2015 16:49:14 +0200 Subject: [PATCH] convert csv-files to UTF-8 #63 --- includes/descriptionList.py | 21 ++------- includes/helper/uft8Converter.py | 76 ++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 17 deletions(-) create mode 100644 includes/helper/uft8Converter.py diff --git a/includes/descriptionList.py b/includes/descriptionList.py index 3e7cb7f..fac53cc 100644 --- a/includes/descriptionList.py +++ b/includes/descriptionList.py @@ -14,6 +14,7 @@ import logging # Global logger import csv # for loading the description files from includes import globals # Global variables +from includes.helper import uft8Converter # UTF-8 converter ## @@ -37,25 +38,11 @@ def loadCSV(typ, idField): logging.debug(row) # only import rows with an integer as id if row[idField].isdigit() == True: - # check if string contains non-utf8 characters - description = "" try: - description = row['description'].decode('UTF-8', 'strict') - except UnicodeDecodeError: - # line contains non-utf8 character - logging.debug("row contains non-utf8 characters: %s", row['description']) - # try to find out codec: - encodings = ('windows-1250', 'windows-1252', 'iso-8859-1', 'iso-8859-15') - for enc in encodings: - try: - description = f.decode(enc) - break - except Exception: - pass - # encode in UTF-8 - description = description.encode('UTF-8') + resultList[row[idField]] = uft8Converter.convertToUTF8(row['description']) + except: + # skip entry in case of an exception pass - resultList[row[idField]] = description logging.debug("-- loading csv finished") except: logging.error("loading csvList for typ: %s failed", typ) diff --git a/includes/helper/uft8Converter.py b/includes/helper/uft8Converter.py new file mode 100644 index 0000000..16fee3e --- /dev/null +++ b/includes/helper/uft8Converter.py @@ -0,0 +1,76 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +# + +""" +little Helper for 
import logging


def _decodeWithFallback(raw):
    """
    Decodes a byte string that is not valid UTF-8 by trying legacy codecs

    @type    raw: byte string
    @param   raw: bytes which failed strict UTF-8 decoding

    @return:    decoded (unicode) text
    @exception: UnicodeDecodeError if none of the candidate codecs fits
    """
    # Candidates are tried in order, the first codec that decodes wins.
    # NOTE: 'latin_1' maps every possible byte value, so the entries behind
    # it (and the failure branch below) are only reached if this tuple is
    # reordered. The order is kept as is to preserve existing results.
    encodings = ('windows-1250', 'windows-1252', 'latin_1', 'cp850', 'cp852', 'iso8859_2', 'iso8859_15', 'mac_latin2', 'mac_roman')
    for enc in encodings:
        try:
            decoded = raw.decode(enc)
        except UnicodeDecodeError:
            # only give up after the last candidate has failed too
            if enc == encodings[-1]:
                logging.warning("no encoding found")
                logging.debug("no encoding found", exc_info=True)
                # no fixing possible, raise exception
                raise
        else:
            logging.debug("string was encoded in: %s", enc)
            return decoded


def convertToUTF8(string = ""):
    """
    Returns given string as UTF-8 encoded byte string

    Every non-empty result has the same type (byte string): input which is
    already valid UTF-8 is handed back unchanged, anything else is decoded
    with the first matching legacy codec and re-encoded as UTF-8.

    @type    string: String
    @param   string: String to convert to UTF-8 (byte string or unicode text)

    @return:    string in UTF-8
    @exception: Exception if converting to UTF-8 failed
    """
    # nothing to do if string is empty
    # (len() on purpose: input without a length, e.g. None, still raises
    # TypeError, so callers which skip faulty entries keep doing so)
    if len(string) == 0:
        return b""

    if isinstance(string, type(u"")):
        # already decoded text: no codec guessing needed, just encode it
        decoded = string
    else:
        try:
            # check given string is already UTF-8
            string.decode('UTF-8', 'strict')
        except UnicodeDecodeError:
            # string contains non-UTF-8 character
            logging.debug("string contains non-UTF-8 characters: %s", string)
            decoded = _decodeWithFallback(string)
        else:
            # valid UTF-8: return the bytes themselves, not the decoded
            # text, so that all paths return the same type
            return string

    # string is decoded now...

    try:
        # encode decoded string to UTF-8
        utf8String = decoded.encode('UTF-8')
    except Exception:
        logging.warning("encoding to UTF-8 failed")
        logging.debug("encoding to UTF-8 failed", exc_info=True)
        # no fixing possible, raise exception
        raise

    # Now we must have an utf8-string, check it:
    try:
        utf8String.decode('UTF-8', 'strict')
    except Exception:
        logging.warning("converting to UTF-8 failed")
        logging.debug("converting to UTF-8 failed", exc_info=True)
        # no fixing possible, raise exception
        raise

    logging.debug("string converting succeeded: %s", utf8String)
    return utf8String