Monday, October 25, 2010

Remove non-printable / Unicode characters in SQL Server 2005

A few months ago, I was upgrading some report templates from the older version of Excel (*.xls) to Excel 2007 (*.xlsx). I ran into numerous problems almost immediately when I attempted to generate the upgraded reports because the incoming data was riddled with charaters that don't play nicely with XML. The data is used for a variety of reporting purposes, so I decided to tackle the problem on the back-end by removing all but the printable ascii characters.

I started by writing a simple user function for individual strings, but I got to thinking that I may want to automate some of these cleanup tasks and ended up putting something together that allows for a bit more the flexibility. The following creates the basic string user function, along with two procedures to perform the cleanup at the column and table level:



Function: fn_npclean_string
use [master]
go
set ansi_nulls on
go
set quoted_identifier on
go
CREATE function [dbo].[fn_npclean_string] (
 @strIn as varchar(1000)
)
returns varchar(1000)
as
begin
 declare @iPtr as int
 set @iPtr = patindex('%[^ -~0-9A-Z]%', @strIn COLLATE LATIN1_GENERAL_BIN)
 while @iPtr > 0 begin
  set @strIn = replace(@strIn COLLATE LATIN1_GENERAL_BIN, substring(@strIn, @iPtr, 1), '')
  set @iPtr = patindex('%[^ -~0-9A-Z]%', @strIn COLLATE LATIN1_GENERAL_BIN)
 end
 return @strIn
end


Procedure: sp_npclean_col
use [master]
go
set ansi_nulls on
go
set quoted_identifier on
go
CREATE procedure [dbo].[sp_npclean_col]
 @DatabaseName varchar(75) = null,
 @SchemaName varchar(75) = null,
 @TableName varchar(75),
 @ColumnName varchar(75)
as
begin
 Declare @FullTableName varchar(100)
 declare @UpdateSQL nvarchar(1000)
 if @DatabaseName is null begin
  set @DatabaseName = db_name()
 end
 if @SchemaName is null begin
  set @SchemaName = schema_name()
 end
 set @FullTableName = '[' + @DatabaseName + '].[' + @SchemaName + '].[' + @TableName + ']'
 set @UpdateSQL = 'update ' + @FullTableName + ' set [' + @ColumnName + '] = dbo.fn_npclean_string([' + @ColumnName + ']) where [' + @ColumnName + '] like ''%[^ -~0-9A-Z]%'''
 exec sp_ExecuteSQL @UpdateSQL
end


Procedure: sp_npclean_table
use [master]
go
set ansi_nulls on
go
set quoted_identifier on
go
create procedure [dbo].[sp_npclean_table] 
 @TargetDatabase varchar(75) = null,
 @TargetSchema varchar(75) = null,
 @TargetTable varchar(75)
as
begin
 declare @getColSQL nvarchar(750)
 declare @textCol CURSOR
 declare @curCol varchar(75)
 if @TargetDatabase is null begin
  set @TargetDatabase = db_name()
 end
 if @TargetSchema is null begin
  set @TargetSchema = schema_name()
 end
 set @getColSQL =
  'select sc.name
  from ' + @TargetDatabase + '.sys.columns sc
  join ' + @TargetDatabase + '.sys.types st
  on sc.system_type_id = st.system_type_id
  join ' + @TargetDatabase + '.sys.objects so
  on sc.object_id = so.object_id
  join ' + @TargetDatabase + '.sys.schemas ss
  on so.schema_id = ss.schema_id
  where
  so.type = ''U''
  and st.name in (''text'',''ntext'',''varchar'',''char'',''nvarchar'',''nchar'')
  and sc.is_rowguidcol = 0
  and sc.is_identity = 0
  and sc.is_computed = 0
  and so.name = ''' + @TargetTable + '''
  and ss.name = ''' + @TargetSchema + ''''
 set @getColSQL = 'set @inCursor = cursor for ' + @getColSQL + ' open @incursor'
 execute sp_executesql @getColSQL,N'@inCursor cursor out',@inCursor=@textCol OUT
 fetch next from @textCol into @curCol
 while @@fetch_status = 0
 begin
  exec sp_npclean_col @DatabaseName = @TargetDatabase, @SchemaName = @TargetSchema, @TableName = @TargetTable, @ColumnName = @curCol
  fetch next from @textCol into @curCol
 end
 Close @textCol
 DeAllocate @textCol
end

Using these, invalid characters can be removed in the following ways:



By String:
select master.dbo.fn_npclean_string('Stringğ withħ įņvalidđ charactersŝ')

By table column:
exec master.dbo.sp_npclean_col [@DatabaseName = 'MyDatabaseName',] [@SchemaName = 'MySchemaName',] @TableName = 'MyTableName',  @ColumnName = 'MyColumnName'

By table:
exec master.dbo.sp_npclean_table [@TargetDatabase = 'MyDatabaseName',] [@TargetSchema = 'MySchemaName',] @TargetTable = 'MyTableName'