mirror of
https://github.com/Schrolli91/BOSWatch.git
synced 2026-01-20 15:20:16 +01:00
convert csv-files to UTF-8 #63
This commit is contained in:
parent
e829778db2
commit
f3db4635c7
|
|
@ -14,6 +14,7 @@ import logging # Global logger
|
|||
import csv # for loading the description files
|
||||
|
||||
from includes import globals # Global variables
|
||||
from includes.helper import uft8Converter # UTF-8 converter
|
||||
|
||||
|
||||
##
|
||||
|
|
@ -37,25 +38,11 @@ def loadCSV(typ, idField):
|
|||
logging.debug(row)
|
||||
# only import rows with an integer as id
|
||||
if row[idField].isdigit() == True:
|
||||
# check if string contains non-utf8 characters
|
||||
description = ""
|
||||
try:
|
||||
description = row['description'].decode('UTF-8', 'strict')
|
||||
except UnicodeDecodeError:
|
||||
# line contains non-utf8 character
|
||||
logging.debug("row contains non-utf8 characters: %s", row['description'])
|
||||
# try to find out codec:
|
||||
encodings = ('windows-1250', 'windows-1252', 'iso-8859-1', 'iso-8859-15')
|
||||
for enc in encodings:
|
||||
try:
|
||||
description = f.decode(enc)
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
# encode in UTF-8
|
||||
description = description.encode('UTF-8')
|
||||
resultList[row[idField]] = uft8Converter.convertToUTF8(row['description'])
|
||||
except:
|
||||
# skip entry in case of an exception
|
||||
pass
|
||||
resultList[row[idField]] = description
|
||||
logging.debug("-- loading csv finished")
|
||||
except:
|
||||
logging.error("loading csvList for typ: %s failed", typ)
|
||||
|
|
|
|||
76
includes/helper/uft8Converter.py
Normal file
76
includes/helper/uft8Converter.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: UTF-8 -*-
|
||||
#
|
||||
|
||||
"""
|
||||
little Helper for converting strings
|
||||
|
||||
@author: Jens Herrmann
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
|
||||
def convertToUTF8(string = ""):
    """
    Returns given string in UTF-8

    The result is always a UTF-8 encoded byte string, no matter whether
    the input was already valid UTF-8, had to be converted from one of
    the fallback encodings, or was an already decoded text object.

    @type    string: String
    @param   string: String to convert to UTF-8

    @return:    string in UTF-8 (empty byte string for empty input)
    @exception: Exception if converting to UTF-8 failed
    """

    # b"" is identical to "" on Python 2 and keeps the return type
    # consistent with the non-empty case
    uft8String = b""

    # nothing to do if string is empty
    if len(string) == 0:
        return uft8String

    # type(u"") is the text type (unicode on Python 2); such input is
    # already decoded and only needs the encoding step below
    decoded = string
    if not isinstance(string, type(u"")):
        try:
            # check if given string is already UTF-8; keep only the decoded
            # form here, it is re-encoded below so that every path returns
            # the same (encoded) type
            decoded = string.decode('UTF-8', 'strict')
        except UnicodeDecodeError:
            # string contains non-UTF-8 character
            logging.debug("string contains non-UTF-8 characters: %s", string)

            # try to find out encoding, first match wins:
            encodings = ('windows-1250', 'windows-1252', 'latin_1', 'cp850', 'cp852', 'iso8859_2', 'iso8859_15', 'mac_latin2', 'mac_roman')
            for enc in encodings:
                try:
                    decoded = string.decode(enc)
                    logging.debug("string was encoded in: %s", enc)
                    break
                except Exception:
                    # only give up once the last candidate has failed too
                    if enc == encodings[-1]:
                        logging.warning("no encoding found")
                        logging.debug("no encoding found", exc_info=True)
                        # no fixing possible, raise exception
                        raise

    # string should be decoded now...
    try:
        # encode decoded string to UTF-8
        uft8String = decoded.encode('UTF-8')
    except Exception:
        logging.warning("encoding to UTF-8 failed")
        logging.debug("encoding to UTF-8 failed", exc_info=True)
        # no fixing possible, raise exception
        raise

    # Now we must have an utf8-string, check it:
    try:
        uft8String.decode('UTF-8', 'strict')
        logging.debug("string converting succeeded: %s", uft8String)
    except Exception:
        logging.warning("converting to UTF-8 failed")
        logging.debug("converting to UTF-8 failed", exc_info=True)
        # no fixing possible, raise exception
        raise

    return uft8String
|
||||
Loading…
Reference in a new issue