简体   繁体   中英

Python cannot save utf-8 to oracle

I am trying to save a UTF-8 string to an Oracle database from Python via cx_Oracle, but it fails.

Here is the string which has a special T in it: 'Strada Ion Țuculescu 20, Craiova, Romania'

Here is the db config:

PARAMETER                      VALUE                                  
------------------------------ ----------------------------------------
NLS_CHARACTERSET               EE8ISO8859P2                             
NLS_NCHAR_CHARACTERSET         AL16UTF16 

The target column in the table is of type NVARCHAR2.

In python I do:

# Request the AL32UTF8 (UTF-8) client character set through the NLS_LANG
# environment variable.
# NOTE(review): the Oracle client presumably reads NLS_LANG when it is
# initialised, so this likely has to run before the connection is created -- confirm.
os.environ['NLS_LANG'] = '.AL32UTF8'
# Declare two bind variables, by position, as NCHAR.
# NOTE(review): the statement below uses named binds (:var, :ID_); verify that
# these positional sizes line up with them as intended.
cur.setinputsizes(cx_Oracle.NCHAR, cx_Oracle.NCHAR)
# Run the UPDATE once for every entry in `values`.
cur.executemany('update table set var=:var where ID_=:ID_', values)

And I still get '?' instead of that special 'T'.

What am I doing wrong?

I don't know if it helps but I wrote a small PL/SQL program to find mismatches:

-- Prints a character-set conversion matrix for a single character.
--
-- For every candidate character set (rows) the character is encoded into that
-- set, and the resulting bytes are then decoded as if they belonged to every
-- other candidate set (columns). Each cell shows 'Match' when the round trip
-- yields the original character, otherwise whatever the mis-decoding produced.
-- Output is written with DBMS_OUTPUT as tab-separated lines.
DECLARE
    -- Character to analyse: either a literal character or a code point
    -- written as 'U+<hex digits>'. Alternative inputs, for example:
    --   sourceChar VARCHAR2(10) := 'U+21A';
    --   sourceChar VARCHAR2(10) := 'U+E2';
    sourceChar VARCHAR2(10) := 'Ț';

    codepoint INTEGER;      -- Unicode code point of sourceChar
    sg1 VARCHAR2(6);        -- high surrogate as hex digits
    sg2 VARCHAR2(4);        -- low surrogate as hex digits

    encodedChar RAW(32767);         -- sourceChar encoded in the current row's character set
    decodedChar VARCHAR2(32767);    -- encodedChar decoded as the current column's character set

    -- Candidate character sets: those whose IANA name contains UTF, ISO-8859
    -- or WINDOWS.
    CURSOR CharacterSets IS
    SELECT VALUE, UTL_I18N.MAP_CHARSET(VALUE) AS IANA_VALUE
    FROM V$NLS_VALID_VALUES 
    WHERE parameter = 'CHARACTERSET' 
        AND REGEXP_LIKE(UTL_I18N.MAP_CHARSET(VALUE), 'UTF|ISO-8859|WINDOWS') 
    ORDER BY IANA_VALUE, VALUE;

    -- Same query as CharacterSets. A second cursor is declared because it is
    -- iterated inside the CharacterSets loop, and an explicit cursor cannot be
    -- opened again while it is still open.
    CURSOR ClientCharacterSets IS
    SELECT VALUE, UTL_I18N.MAP_CHARSET(VALUE) AS IANA_VALUE
    FROM V$NLS_VALID_VALUES 
    WHERE parameter = 'CHARACTERSET' 
        AND REGEXP_LIKE(UTL_I18N.MAP_CHARSET(VALUE), 'UTF|ISO-8859|WINDOWS') 
    ORDER BY IANA_VALUE, VALUE;

BEGIN
    IF REGEXP_LIKE(sourceChar, '^U\+[[:xdigit:]]+$') THEN
        -- Input is 'U+<hex>': parse the code point and build the character.
        codepoint := TO_NUMBER(REGEXP_REPLACE(sourceChar, '^U\+'), 'fmXXXXX');
        IF codepoint <= 65535 THEN
            -- A single 4-digit UNISTR escape is enough.
            sourceChar := UNISTR('\'||LPAD(TO_CHAR(codepoint, 'fmXXXXX'), 4, '0'));
        ELSE
            -- Above U+FFFF: split into a surrogate pair for UNISTR.
            sg1 := TO_CHAR(TO_NUMBER('D800', 'XXXX') + TRUNC((codepoint - 2**16) / 2**10), 'fmXXXX');
            sg2 := TO_CHAR(TO_NUMBER('DC00', 'XXXX') + (codepoint - 2**16) MOD 2**10, 'fmXXXX');
            sourceChar := UNISTR('\'||LPAD(sg1, 4, '0')||'\'||LPAD(sg2, 4, '0'));
        END IF;
    ELSE    
        -- Input is a literal character: derive the code point from ASCIISTR,
        -- which renders non-ASCII characters as \XXXX escapes.
        IF REGEXP_LIKE(ASCIISTR(sourceChar), '^\\[[:xdigit:]]+$') THEN
            codepoint := TO_NUMBER(REGEXP_REPLACE(ASCIISTR(sourceChar), '^\\'), 'XXXXX');
        ELSIF REGEXP_LIKE(ASCIISTR(sourceChar), '^\\[[:xdigit:]]+\\[[:xdigit:]]+$') THEN
            -- Two escapes: recombine the surrogate pair into one code point.
            sg1 := REGEXP_SUBSTR(ASCIISTR(sourceChar), '[[:xdigit:]]+');
            sg2 := REGEXP_SUBSTR(ASCIISTR(sourceChar), '[[:xdigit:]]+', 1, 2);
            codepoint := 2**10 * (TO_NUMBER(sg1, 'XXXXX') - TO_NUMBER('D800', 'XXXX') + 2**6) + TO_NUMBER(sg2, 'XXXX') - TO_NUMBER('DC00', 'XXXX');     
        ELSE
            -- No escape produced: fall back to ASCII().
            codepoint := ASCII(sourceChar);
        END IF;
    END IF;
    DBMS_OUTPUT.PUT_LINE ( sourceChar || ' -> U+' || TO_CHAR(codepoint, 'fmXXXXX') );

    -- Header line: one column per character set the bytes are decoded with.
    DBMS_OUTPUT.PUT('Character Set'||CHR(9)||'Char' );
    FOR aSet IN ClientCharacterSets LOOP
        DBMS_OUTPUT.PUT(CHR(9)|| aSet.VALUE);
    END LOOP;
    DBMS_OUTPUT.NEW_LINE();

    -- One line per character set the character is encoded in.
    FOR aSrcSet IN CharacterSets LOOP
        -- Encode once per row; the bytes are reused for every column below.
        encodedChar := UTL_I18N.STRING_TO_RAW(sourceChar, aSrcSet.VALUE);
        DBMS_OUTPUT.PUT(aSrcSet.VALUE ||CHR(9)|| '0x'||RAWTOHEX(encodedChar) );
        FOR aClientSet IN ClientCharacterSets LOOP
            IF aSrcSet.VALUE = aClientSet.VALUE THEN
                -- Diagonal cell: same set on both sides, print its name.
                DBMS_OUTPUT.PUT(CHR(9)|| aSrcSet.VALUE );
            ELSE
                -- Decode once and use the result for both the comparison and
                -- the printed value.
                decodedChar := UTL_I18N.RAW_TO_CHAR(encodedChar, aClientSet.VALUE);
                IF decodedChar = sourceChar THEN
                    DBMS_OUTPUT.PUT(CHR(9)|| 'Match' );
                ELSE
                    DBMS_OUTPUT.PUT(CHR(9)|| decodedChar );
                END IF;
            END IF;
        END LOOP;
        DBMS_OUTPUT.NEW_LINE();
    END LOOP;
END;

For Ț you get

Ț -> U+21A
Character Set   Char    WE8ISO8859P1    NE8ISO8859P10   BLT8ISO8859P13  CEL8ISO8859P14  WE8ISO8859P15   EE8ISO8859P2    SE8ISO8859P3    NEE8ISO8859P4   CL8ISO8859P5    AR8ISO8859P6    EL8ISO8859P7    IW8ISO8859P8    WE8ISO8859P9    AL16UTF16   AL32UTF8    UTF8    EE8MSWIN1250    CL8MSWIN1251    WE8MSWIN1252    EL8MSWIN1253    TR8MSWIN1254    IW8MSWIN1255    AR8MSWIN1256    BLT8MSWIN1257   VN8MSWIN1258
WE8ISO8859P1    0xBF    WE8ISO8859P1    ŋ   æ   ṡ   ¿   ż   ż   ŋ   П   ؟   Ώ   �   ¿           �   ż   ї   ¿   Ώ   ¿   ¿   ؟   æ   ¿
NE8ISO8859P10   0x3F    ?   NE8ISO8859P10   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
BLT8ISO8859P13  0x3F    ?   ?   BLT8ISO8859P13  ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
CEL8ISO8859P14  0x3F    ?   ?   ?   CEL8ISO8859P14  ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
WE8ISO8859P15   0xBF    ¿   ŋ   æ   ṡ   WE8ISO8859P15   ż   ż   ŋ   П   ؟   Ώ   �   ¿           �   ż   ї   ¿   Ώ   ¿   ¿   ؟   æ   ¿
EE8ISO8859P2    0x3F    ?   ?   ?   ?   ?   EE8ISO8859P2    ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
SE8ISO8859P3    0x3F    ?   ?   ?   ?   ?   ?   SE8ISO8859P3    ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
NEE8ISO8859P4   0x3F    ?   ?   ?   ?   ?   ?   ?   NEE8ISO8859P4   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
CL8ISO8859P5    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   CL8ISO8859P5    ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
AR8ISO8859P6    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   AR8ISO8859P6    ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
EL8ISO8859P7    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   EL8ISO8859P7    ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
IW8ISO8859P8    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   IW8ISO8859P8    ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
WE8ISO8859P9    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   WE8ISO8859P9        ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
AL16UTF16   0x021A                                                      AL16UTF16                                           
AL32UTF8    0xC89A  È  Č  Č  È  È  Č  È  Č  Ш  ب  Θ  �  È  좚   AL32UTF8    Match   Čš  Иљ  Èš  Θ  Èš  ָ  بڑ  Č  È
UTF8    0xC89A  È  Č  Č  È  È  Č  È  Č  Ш  ب  Θ  �  È  좚   Match   UTF8    Čš  Иљ  Èš  Θ  Èš  ָ  بڑ  Č  È
EE8MSWIN1250    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   EE8MSWIN1250    ?   ?   ?   ?   ?   ?   ?   ?
CL8MSWIN1251    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   CL8MSWIN1251    ?   ?   ?   ?   ?   ?   ?
WE8MSWIN1252    0xBF    ¿   ŋ   æ   ṡ   ¿   ż   ż   ŋ   П   ؟   Ώ   �   ¿           �   ż   ї   WE8MSWIN1252    Ώ   ¿   ¿   ؟   æ   ¿
EL8MSWIN1253    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   EL8MSWIN1253    ?   ?   ?   ?   ?
TR8MSWIN1254    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   TR8MSWIN1254    ?   ?   ?   ?
IW8MSWIN1255    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   IW8MSWIN1255    ?   ?   ?
AR8MSWIN1256    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   AR8MSWIN1256    ?   ?
BLT8MSWIN1257   0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   BLT8MSWIN1257   ?
VN8MSWIN1258    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   VN8MSWIN1258

For â you get

â -> U+E2
Character Set   Char    WE8ISO8859P1    NE8ISO8859P10   BLT8ISO8859P13  CEL8ISO8859P14  WE8ISO8859P15   EE8ISO8859P2    SE8ISO8859P3    NEE8ISO8859P4   CL8ISO8859P5    AR8ISO8859P6    EL8ISO8859P7    IW8ISO8859P8    WE8ISO8859P9    AL16UTF16   AL32UTF8    UTF8    EE8MSWIN1250    CL8MSWIN1251    WE8MSWIN1252    EL8MSWIN1253    TR8MSWIN1254    IW8MSWIN1255    AR8MSWIN1256    BLT8MSWIN1257   VN8MSWIN1258
WE8ISO8859P1    0xE2    WE8ISO8859P1    Match   ā   Match   Match   Match   Match   Match   т   ق   β   ג   Match               Match   в   Match   β   Match   ג   Match   ā   Match
NE8ISO8859P10   0xE2    Match   NE8ISO8859P10   ā   Match   Match   Match   Match   Match   т   ق   β   ג   Match               Match   в   Match   β   Match   ג   Match   ā   Match
BLT8ISO8859P13  0x3F    ?   ?   BLT8ISO8859P13  ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
CEL8ISO8859P14  0xE2    Match   Match   ā   CEL8ISO8859P14  Match   Match   Match   Match   т   ق   β   ג   Match               Match   в   Match   β   Match   ג   Match   ā   Match
WE8ISO8859P15   0xE2    Match   Match   ā   Match   WE8ISO8859P15   Match   Match   Match   т   ق   β   ג   Match               Match   в   Match   β   Match   ג   Match   ā   Match
EE8ISO8859P2    0xE2    Match   Match   ā   Match   Match   EE8ISO8859P2    Match   Match   т   ق   β   ג   Match               Match   в   Match   β   Match   ג   Match   ā   Match
SE8ISO8859P3    0xE2    Match   Match   ā   Match   Match   Match   SE8ISO8859P3    Match   т   ق   β   ג   Match               Match   в   Match   β   Match   ג   Match   ā   Match
NEE8ISO8859P4   0xE2    Match   Match   ā   Match   Match   Match   Match   NEE8ISO8859P4   т   ق   β   ג   Match               Match   в   Match   β   Match   ג   Match   ā   Match
CL8ISO8859P5    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   CL8ISO8859P5    ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
AR8ISO8859P6    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   AR8ISO8859P6    ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
EL8ISO8859P7    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   EL8ISO8859P7    ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
IW8ISO8859P8    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   IW8ISO8859P8    ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?
WE8ISO8859P9    0xE2    Match   Match   ā   Match   Match   Match   Match   Match   т   ق   β   ג   WE8ISO8859P9                Match   в   Match   β   Match   ג   Match   ā   Match
AL16UTF16   0x00E2  â   â   ā   â   â   â   â   â   т   ق   β   ג   â   AL16UTF16           â   в   â   β   â   ג   â   ā   â
AL32UTF8    0xC3A2  â  ÃĒ  Ć¢  Ãḃ  â  â  �˘  Ãĸ  УЂ  أ�  Γ’  �¢  â  쎢   AL32UTF8    Match   â  Гў  â  ΓΆ  â  ֳ¢  أ¢  Ć¢  Ă¢
UTF8    0xC3A2  â  ÃĒ  Ć¢  Ãḃ  â  â  �˘  Ãĸ  УЂ  أ�  Γ’  �¢  â  쎢   Match   UTF8    â  Гў  â  ΓΆ  â  ֳ¢  أ¢  Ć¢  Ă¢
EE8MSWIN1250    0xE2    Match   Match   ā   Match   Match   Match   Match   Match   т   ق   β   ג   Match               EE8MSWIN1250    в   Match   β   Match   ג   Match   ā   Match
CL8MSWIN1251    0x61    a   a   a   a   a   a   a   a   a   a   a   a   a       a   a   a   CL8MSWIN1251    a   a   a   a   a   a   a
WE8MSWIN1252    0xE2    Match   Match   ā   Match   Match   Match   Match   Match   т   ق   β   ג   Match               Match   в   WE8MSWIN1252    β   Match   ג   Match   ā   Match
EL8MSWIN1253    0x61    a   a   a   a   a   a   a   a   a   a   a   a   a       a   a   a   a   a   EL8MSWIN1253    a   a   a   a   a
TR8MSWIN1254    0xE2    Match   Match   ā   Match   Match   Match   Match   Match   т   ق   β   ג   Match               Match   в   Match   β   TR8MSWIN1254    ג   Match   ā   Match
IW8MSWIN1255    0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   IW8MSWIN1255    ?   ?   ?
AR8MSWIN1256    0xE2    Match   Match   ā   Match   Match   Match   Match   Match   т   ق   β   ג   Match               Match   в   Match   β   Match   ג   AR8MSWIN1256    ā   Match
BLT8MSWIN1257   0x3F    ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?   ?       ?   ?   ?   ?   ?   ?   ?   ?   ?   BLT8MSWIN1257   ?
VN8MSWIN1258    0xE2    Match   Match   ā   Match   Match   Match   Match   Match   т   ق   β   ג   Match               Match   в   Match   β   Match   ג   Match   ā   VN8MSWIN1258

As you can see, â has many more matches than Ț, i.e. its byte value is the same in many character sets. Try some more characters; perhaps you will then find out which character set Python is using and how to set it.

This is the main part of the code; the rest is omitted. I am using Python 3, and the Oracle part runs on Windows 7.

# Main flow: optionally configure a proxy, load the rows to geocode (from a CSV
# file or from the geo_poi table), run them through update_geo, then write the
# results back to wherever they were read from.
if args.proxy_pwd is not None:
    print('Setting proxy...')
    set_proxy(args.proxy_pwd)

if args.db_pwd is None:     # reading from csv file
    print('Reading sourcefile...')
    df = pd.read_csv(params[args.source][0], sep = ';')
else:                       # reading from db
    print('Reading db...')

    import cx_Oracle
    # NOTE(review): cx_Oracle.connect also takes encoding / nencoding keyword
    # arguments; passing encoding='UTF-8', nencoding='UTF-8' is presumably what
    # is needed for non-Latin-2 characters to reach NVARCHAR2 columns intact --
    # confirm against the installed cx_Oracle version.
    con = cx_Oracle.connect(args.db_usr, args.db_pwd, args.db_type)
    # Fetch a batch of rows that have not been geocoded yet.
    df = pd.read_sql_query('''select * from geo_poi 
                              where geo_status is Null and rownum < 3000''', 
                           con)

df = update_geo(df, params[args.source][1]) # query google api

# save results
print('Saving...')
if args.db_pwd is None: # to csv
    df.to_csv(params[args.source][0], index=False, sep = ';')
else:                   # to db
    df = df[df['GEO_STATUS'].notnull()]     # save only geocoded results

    # Column order must match the order of the bind variables in the UPDATE
    # statement below (SET columns first, then the two key columns).
    columns = ['GEO_STATUS', 'GEO_ADDRESS', 'GEO_POSTCODE',
               'GEO_TYPE', 'GEO_LATITUDE', 'GEO_LONGITUDE',
               'GEO_ACCURACY', 'GEO_GOOGLE_ID', 'GEO_NUMBER_OF_RESULTS',
               'TYPE_', 'ID_']
    # The filter above keeps the original index labels, so walk the labels that
    # are actually present. Using range(len(df)) with .loc would raise KeyError
    # or pick the wrong rows as soon as a row in the middle was dropped.
    values = [[df.loc[i, j] for j in columns] for i in df.index]

    cur = con.cursor()
    # NOTE(review): no setinputsizes() call is made here; if any target column
    # is NVARCHAR2, its bind presumably needs to be declared as cx_Oracle.NCHAR
    # -- confirm against the table definition.
    cur.executemany('''update geo_poi
                       set GEO_STATUS=:GEO_STATUS, GEO_ADDRESS=:GEO_ADDRESS, 
                           GEO_POSTCODE=:GEO_POSTCODE, GEO_TYPE=:GEO_TYPE, 
                           GEO_LATITUDE=:GEO_LATITUDE, GEO_LONGITUDE=:GEO_LONGITUDE,
                           GEO_ACCURACY=:GEO_ACCURACY, GEO_GOOGLE_ID=:GEO_GOOGLE_ID, 
                           GEO_NUMBER_OF_RESULTS=:GEO_NUMBER_OF_RESULTS
                       where TYPE_=:TYPE_ and ID_=:ID_''',
                     values)
    con.commit()

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM