convert csv-files to UTF-8 #63

2026-03-07 05:23:55 +01:00 · 2015-07-14 16:49:14 +02:00 · 2015-07-14 16:49:14 +02:00 · f3db4635c7
parent e829778db2
commit f3db4635c7
2 changed files with 80 additions and 17 deletions
--- a/includes/descriptionList.py
+++ b/includes/descriptionList.py
@ -14,6 +14,7 @@ import logging # Global logger
 import csv # for loading the description files

 from includes import globals  # Global variables
+from includes.helper import uft8Converter  # UTF-8 converter


 ##
@ -37,25 +38,11 @@ def loadCSV(typ, idField):
 				logging.debug(row)
 				# only import rows with an integer as id
 				if row[idField].isdigit() == True:
-					# check if string contains non-utf8 characters
-					description = ""
 					try:
-						description = row['description'].decode('UTF-8', 'strict')
-					except UnicodeDecodeError:
-						# line contains non-utf8 character
-						logging.debug("row contains non-utf8 characters: %s", row['description'])
-						# try to find out codec:
-						encodings = ('windows-1250', 'windows-1252', 'iso-8859-1', 'iso-8859-15')
-						for enc in encodings:
-							try:
-								description = f.decode(enc)
-								break
-							except Exception:
-								pass
-						# encode in UTF-8
-						description = description.encode('UTF-8')
+						resultList[row[idField]] = uft8Converter.convertToUTF8(row['description'])
+					except:
+						# skip entry in case of an exception
 						pass
-					resultList[row[idField]] = description
 		logging.debug("-- loading csv finished")
 	except:
 		logging.error("loading csvList for typ: %s failed", typ)
--- a/includes/helper/uft8Converter.py
+++ b/includes/helper/uft8Converter.py
@ -0,0 +1,76 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+#
+
+"""
+Little helper for converting strings to UTF-8
+
+@author: Jens Herrmann
+"""
+
+import logging
+
+
+def convertToUTF8(string = ""):
+	"""
+	Returns the given string in UTF-8
+
+	@type    string: String
+	@param   string: String to convert to UTF-8
+
+	@return:    string in UTF-8
+	@exception: Exception if converting to UTF-8 failed
+	"""
+
+	uft8String = ""
+	
+	# nothing to do if string is empty
+	if len(string) > 0:
+		try:
+			# if the given string is already valid UTF-8, return the decoded result
+			return string.decode('UTF-8', 'strict')
+		except UnicodeDecodeError:
+			# string contains non-UTF-8 character
+			logging.debug("string contains non-UTF-8 characters: %s", string)
+			
+			# try to find out the encoding: the first candidate that decodes without error wins
+			encodings = ('windows-1250', 'windows-1252', 'latin_1', 'cp850', 'cp852', 'iso8859_2', 'iso8859_15', 'mac_latin2', 'mac_roman')
+			for enc in encodings:
+				try:
+					string = string.decode(enc)
+					logging.debug("string was encoded in: %s", enc)
+					break
+				except Exception:
+					# if decoding fails even for the last encoding in the list, re-raise the exception
+					if enc == encodings[-1]:
+						logging.warning("no encoding found")
+						logging.debug("no encoding found", exc_info=True)
+						# no fixing possible, raise exception
+						raise
+					pass
+			
+			# string should now be decoded...
+			
+			try:
+				# encode decoded string to UTF-8
+				uft8String = string.encode('UTF-8')
+			except:
+				logging.warning("encoding to UTF-8 failed")
+				logging.debug("encoding to UTF-8 failed", exc_info=True)
+				# no fixing possible, raise exception
+				raise
+				
+			# Now we must have a UTF-8 string; verify it:
+			try:
+				uft8String.decode('UTF-8', 'strict')
+				logging.debug("string converting succeeded: %s", uft8String)
+			except:
+				logging.warning("converting to UTF-8 failed")
+				logging.debug("converting to UTF-8 failed", exc_info=True)
+				# no fixing possible, raise exception
+				raise
+			
+			# End of the UnicodeDecodeError handler (string was not already UTF-8)
+			pass
+	
+	return uft8String