convert csv-files to UTF-8 #63

This commit is contained in:
JHCD 2015-07-14 16:49:14 +02:00
parent e829778db2
commit f3db4635c7
2 changed files with 80 additions and 17 deletions

View file

@ -14,6 +14,7 @@ import logging # Global logger
import csv # for loading the description files
from includes import globals # Global variables
from includes.helper import uft8Converter # UTF-8 converter
##
@ -37,25 +38,11 @@ def loadCSV(typ, idField):
logging.debug(row)
# only import rows with an integer as id
if row[idField].isdigit() == True:
# check if string contains non-utf8 characters
description = ""
try:
description = row['description'].decode('UTF-8', 'strict')
except UnicodeDecodeError:
# line contains non-utf8 character
logging.debug("row contains non-utf8 characters: %s", row['description'])
# try to find out codec:
encodings = ('windows-1250', 'windows-1252', 'iso-8859-1', 'iso-8859-15')
for enc in encodings:
try:
description = f.decode(enc)
break
except Exception:
pass
# encode in UTF-8
description = description.encode('UTF-8')
resultList[row[idField]] = uft8Converter.convertToUTF8(row['description'])
except:
# skip entry in case of an exception
pass
resultList[row[idField]] = description
logging.debug("-- loading csv finished")
except:
logging.error("loading csvList for typ: %s failed", typ)

View file

@ -0,0 +1,76 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#
"""
Small helper for converting strings to UTF-8
@author: Jens Herrmann
"""
import logging
def convertToUTF8(string=""):
    """
    Returns given string in UTF-8

    Byte strings which are already valid UTF-8 are returned unchanged.
    Other byte strings are decoded with the first matching legacy 8-bit
    encoding and re-encoded as UTF-8. Text (unicode) objects are encoded
    directly. The result is therefore always a UTF-8 encoded byte string
    (or the empty string for empty input).

    @type    string: String
    @param   string: String to convert to UTF-8

    @return:    string in UTF-8
    @exception: Exception if converting to UTF-8 failed
    """
    # nothing to do if string is empty
    if len(string) == 0:
        return ""

    # text (unicode) objects carry no byte encoding which could be guessed,
    # they only need to be encoded (type(u"") works for Python 2 and 3)
    if isinstance(string, type(u"")):
        return string.encode('UTF-8')

    try:
        # check given string is already UTF-8
        string.decode('UTF-8', 'strict')
    except UnicodeDecodeError:
        # string contains non-UTF-8 character
        logging.debug("string contains non-UTF-8 characters: %s", string)
    else:
        # already UTF-8: hand back the encoded bytes (not the decoded
        # text), so both code paths return the same kind of object
        return string

    # try to find out encoding, the first one which decodes cleanly wins.
    # NOTE: latin_1 maps every byte value, so the entries listed after it
    # only act as a safety net
    encodings = ('windows-1250', 'windows-1252', 'latin_1', 'cp850', 'cp852', 'iso8859_2', 'iso8859_15', 'mac_latin2', 'mac_roman')
    decodedString = None
    for enc in encodings:
        try:
            decodedString = string.decode(enc)
        except Exception:
            # only give up when the last candidate failed as well
            if enc == encodings[-1]:
                logging.warning("no encoding found")
                logging.debug("no encoding found", exc_info=True)
                # no fixing possible, raise exception
                raise
        else:
            logging.debug("string was encoded in: %s", enc)
            break

    # string should now be decoded...
    try:
        # encode decoded string to UTF-8
        utf8String = decodedString.encode('UTF-8')
    except Exception:
        logging.warning("encoding to UTF-8 failed")
        logging.debug("encoding to UTF-8 failed", exc_info=True)
        # no fixing possible, raise exception
        raise

    # Now we must have an utf8-string, check it:
    try:
        utf8String.decode('UTF-8', 'strict')
        logging.debug("string converting succeeded: %s", utf8String)
    except Exception:
        logging.warning("converting to UTF-8 failed")
        logging.debug("converting to UTF-8 failed", exc_info=True)
        # no fixing possible, raise exception
        raise

    return utf8String