Dataframe to Oracle creates table with case sensitive column

前端 未结 2 414
逝去的感伤
逝去的感伤 2021-01-21 15:38

Spark: 2.1.1

I am saving my DataFrame as an Oracle table, but the resulting Oracle table has "case-sensitive" columns.

相关标签:
2条回答
  • 2021-01-21 16:20

    I think the fix should take into account the issue that caused the change: SPARK-16387

    import org.apache.spark.sql.types.{DataType, MetadataBuilder}
    
    /**
     * Oracle JDBC dialect that quotes a column name only when it is an Oracle
     * reserved word or keyword, so that ordinary column names stay unquoted and
     * therefore case-insensitive in the generated DDL.
     *
     * Every other dialect operation is delegated unchanged to `OracleDialect`.
     *
     * NOTE(review): `OracleDialect` appears to be package-private in Spark, so this
     * object presumably has to be compiled inside `org.apache.spark.sql.jdbc` - confirm.
     * NOTE(review): `compileValue` and the two-argument `getTruncateQuery` may not exist
     * on `JdbcDialect` in older Spark releases - verify against the Spark version in use.
     */
    object AxOracleDialect extends JdbcDialect {

      /** Replaces `OracleDialect` with this dialect in the `JdbcDialects` registry. */
      def register(): Unit = {
        JdbcDialects.unregisterDialect(OracleDialect)
        JdbcDialects.registerDialect(this)
      }

      // Upper-case Oracle reserved words and keywords. A column whose upper-cased
      // name appears here is quoted; all other names are emitted as-is.
      private val reservedWordsAndKeywords = Set(
        "ACCESS", "ACCOUNT", "ACTIVATE", "ADD", "ADMIN", "ADVISE", "AFTER", "ALL", "ALL_ROWS",
        "ALLOCATE", "ALTER", "ANALYZE", "AND", "ANY", "ARCHIVE", "ARCHIVELOG", "ARRAY", "AS",
        "ASC", "AT", "AUDIT", "AUTHENTICATED", "AUTHORIZATION", "AUTOEXTEND", "AUTOMATIC", "BACKUP", "BECOME",
        "BEFORE", "BEGIN", "BETWEEN", "BFILE", "BITMAP", "BLOB", "BLOCK", "BODY", "BY",
        "CACHE", "CACHE_INSTANCES", "CANCEL", "CASCADE", "CAST", "CFILE", "CHAINED", "CHANGE", "CHAR",
        "CHAR_CS", "CHARACTER", "CHECK", "CHECKPOINT", "CHOOSE", "CHUNK", "CLEAR", "CLOB", "CLONE",
        "CLOSE", "CLOSE_CACHED_OPEN_CURSORS", "CLUSTER", "COALESCE", "COLUMN", "COLUMNS", "COMMENT", "COMMIT", "COMMITTED",
        "COMPATIBILITY", "COMPILE", "COMPLETE", "COMPOSITE_LIMIT", "COMPRESS", "COMPUTE", "CONNECT", "CONNECT_TIME", "CONSTRAINT",
        "CONSTRAINTS", "CONTENTS", "CONTINUE", "CONTROLFILE", "CONVERT", "COST", "CPU_PER_CALL", "CPU_PER_SESSION", "CREATE",
        "CURRENT", "CURRENT_SCHEMA", "CURRENT_USER", "CURSOR", "CYCLE", "DANGLING", "DATABASE", "DATAFILE", "DATAFILES", "DATAOBJNO",
        "DATE", "DBA", "DBHIGH", "DBLOW", "DBMAC", "DEALLOCATE", "DEBUG", "DEC", "DECIMAL",
        "DECLARE", "DEFAULT", "DEFERRABLE", "DEFERRED", "DEGREE", "DELETE", "DEREF", "DESC", "DIRECTORY",
        "DISABLE", "DISCONNECT", "DISMOUNT", "DISTINCT", "DISTRIBUTED", "DML", "DOUBLE", "DROP", "DUMP",
        "EACH", "ELSE", "ENABLE", "END", "ENFORCE", "ENTRY", "ESCAPE", "EXCEPT", "EXCEPTIONS", "EXCHANGE",
        "EXCLUDING", "EXCLUSIVE", "EXECUTE", "EXISTS", "EXPIRE", "EXPLAIN", "EXTENT", "EXTENTS", "EXTERNALLY",
        "FAILED_LOGIN_ATTEMPTS", "FALSE", "FAST", "FILE", "FIRST_ROWS", "FLAGGER", "FLOAT", "FLOB", "FLUSH",
        "FOR", "FORCE", "FOREIGN", "FREELIST", "FREELISTS", "FROM", "FULL", "FUNCTION", "GLOBAL",
        "GLOBALLY", "GLOBAL_NAME", "GRANT", "GROUP", "GROUPS", "HASH", "HASHKEYS", "HAVING", "HEADER", "HEAP",
        "IDENTIFIED", "IDGENERATORS", "IDLE_TIME", "IF", "IMMEDIATE", "IN", "INCLUDING", "INCREMENT", "INDEX", "INDEXED",
        "INDEXES", "INDICATOR", "IND_PARTITION", "INITIAL", "INITIALLY", "INITRANS", "INSERT", "INSTANCE", "INSTANCES", "INSTEAD",
        "INT", "INTEGER", "INTERMEDIATE", "INTERSECT", "INTO", "IS", "ISOLATION", "ISOLATION_LEVEL", "KEEP", "KEY", "KILL", "LABEL",
        "LAYER", "LESS", "LEVEL", "LIBRARY", "LIKE", "LIMIT", "LINK", "LIST", "LOB", "LOCAL", "LOCK", "LOCKED", "LOG", "LOGFILE",
        "LOGGING", "LOGICAL_READS_PER_CALL", "LOGICAL_READS_PER_SESSION", "LONG", "MANAGE", "MASTER", "MAX", "MAXARCHLOGS",
        "MAXDATAFILES", "MAXEXTENTS", "MAXINSTANCES", "MAXLOGFILES", "MAXLOGHISTORY", "MAXLOGMEMBERS", "MAXSIZE", "MAXTRANS",
        "MAXVALUE", "MIN", "MEMBER", "MINIMUM", "MINEXTENTS", "MINUS", "MINVALUE", "MLSLABEL", "MLS_LABEL_FORMAT", "MODE", "MODIFY",
        "MOUNT", "MOVE", "MTS_DISPATCHERS", "MULTISET", "NATIONAL", "NCHAR", "NCHAR_CS", "NCLOB", "NEEDED", "NESTED", "NETWORK",
        "NEW", "NEXT", "NOARCHIVELOG", "NOAUDIT", "NOCACHE", "NOCOMPRESS", "NOCYCLE", "NOFORCE", "NOLOGGING", "NOMAXVALUE", "NOMINVALUE",
        "NONE", "NOORDER", "NOOVERRIDE", "NOPARALLEL", "NOREVERSE", "NORMAL", "NOSORT", "NOT", "NOTHING", "NOWAIT", "NULL",
        "NUMBER", "NUMERIC", "NVARCHAR2", "OBJECT", "OBJNO", "OBJNO_REUSE", "OF", "OFF", "OFFLINE", "OID", "OIDINDEX", "OLD", "ON",
        "ONLINE", "ONLY", "OPCODE", "OPEN", "OPTIMAL", "OPTIMIZER_GOAL", "OPTION", "OR", "ORDER", "ORGANIZATION", "OSLABEL", "OVERFLOW",
        "OWN", "PACKAGE", "PARALLEL", "PARTITION", "PASSWORD", "PASSWORD_GRACE_TIME", "PASSWORD_LIFE_TIME", "PASSWORD_LOCK_TIME",
        "PASSWORD_REUSE_MAX", "PASSWORD_REUSE_TIME", "PASSWORD_VERIFY_FUNCTION", "PCTFREE", "PCTINCREASE", "PCTTHRESHOLD", "PCTUSED",
        "PCTVERSION", "PERCENT", "PERMANENT", "PLAN", "PLSQL_DEBUG", "POST_TRANSACTION", "PRECISION", "PRESERVE", "PRIMARY", "PRIOR",
        "PRIVATE", "PRIVATE_SGA", "PRIVILEGE", "PRIVILEGES", "PROCEDURE", "PROFILE", "PUBLIC", "PURGE", "QUEUE", "QUOTA", "RANGE",
        "RAW", "RBA", "READ", "READUP", "REAL", "REBUILD", "RECOVER", "RECOVERABLE", "RECOVERY", "REF", "REFERENCES", "REFERENCING",
        "REFRESH", "RENAME", "REPLACE", "RESET", "RESETLOGS", "RESIZE", "RESOURCE", "RESTRICTED", "RETURN", "RETURNING", "REUSE",
        "REVERSE", "REVOKE", "ROLE", "ROLES", "ROLLBACK", "ROW", "ROWID", "ROWNUM", "ROWS", "RULE", "SAMPLE", "SAVEPOINT", "SB4",
        "SCAN_INSTANCES", "SCHEMA", "SCN", "SCOPE", "SD_ALL", "SD_INHIBIT", "SD_SHOW", "SEGMENT", "SEG_BLOCK", "SEG_FILE", "SELECT",
        "SEQUENCE", "SERIALIZABLE", "SESSION", "SESSION_CACHED_CURSORS", "SESSIONS_PER_USER", "SET", "SHARE", "SHARED", "SHARED_POOL",
        "SHRINK", "SIZE", "SKIP", "SKIP_UNUSABLE_INDEXES", "SMALLINT", "SNAPSHOT", "SOME", "SORT", "SPECIFICATION", "SPLIT",
        "SQL_TRACE", "STANDBY", "START", "STATEMENT_ID", "STATISTICS", "STOP", "STORAGE", "STORE", "STRUCTURE", "SUCCESSFUL",
        "SWITCH", "SYS_OP_ENFORCE_NOT_NULL$", "SYS_OP_NTCIMG$", "SYNONYM", "SYSDATE", "SYSDBA", "SYSOPER", "SYSTEM", "TABLE",
        "TABLES", "TABLESPACE", "TABLESPACE_NO", "TABNO", "TEMPORARY", "THAN", "THE", "THEN", "THREAD", "TIMESTAMP", "TIME", "TO",
        "TOPLEVEL", "TRACE", "TRACING", "TRANSACTION", "TRANSITIONAL", "TRIGGER", "TRIGGERS", "TRUE", "TRUNCATE", "TX", "TYPE", "UB2",
        "UBA", "UID", "UNARCHIVED", "UNDO", "UNION", "UNIQUE", "UNLIMITED", "UNLOCK", "UNRECOVERABLE", "UNTIL", "UNUSABLE", "UNUSED",
        "UPDATABLE", "UPDATE", "USAGE", "USE", "USER", "USING", "VALIDATE", "VALIDATION", "VALUE", "VALUES", "VARCHAR", "VARCHAR2",
        "VARYING", "VIEW", "WHEN", "WHENEVER", "WHERE", "WITH", "WITHOUT", "WORK", "WRITE", "WRITEDOWN", "WRITEUP", "XID", "YEAR",
        "ZONE"
      )

      // Everything below except quoteIdentifier simply forwards to OracleDialect.

      override def canHandle(url: String): Boolean = OracleDialect.canHandle(url)

      override def getCatalystType(sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
        OracleDialect.getCatalystType(sqlType, typeName, size, md)

      override def getJDBCType(dt: DataType): Option[JdbcType] = OracleDialect.getJDBCType(dt)

      override def compileValue(value: Any): Any = OracleDialect.compileValue(value)

      override def isCascadingTruncateTable(): Option[Boolean] = OracleDialect.isCascadingTruncateTable()

      override def getTruncateQuery(table: String, cascade: Option[Boolean] = isCascadingTruncateTable()): String = OracleDialect.getTruncateQuery(table, cascade)

      /**
       * Quotes `colName` only if its upper-cased form is a reserved word or keyword.
       *
       * @param colName column name as it appears in the DataFrame schema
       * @return the name quoted via the inherited `quoteIdentifier` when it is
       *         reserved, otherwise `colName` unchanged
       */
      override def quoteIdentifier(colName: String): String =
        // Locale.ROOT keeps the lookup independent of the JVM default locale
        // (e.g. a Turkish locale upper-cases "i" to a dotted capital I, which
        // would make "insert" miss the "INSERT" entry).
        if (reservedWordsAndKeywords.contains(colName.toUpperCase(java.util.Locale.ROOT))) super.quoteIdentifier(colName)
        else colName
    }
    
    0 讨论(0)
  • 2021-01-21 16:22

    I found the issue and the solution: starting with Spark 2.x, every column name is double-quoted when the table is created, so the resulting Oracle table's column names become case-sensitive when you query them via SQL*Plus.

    dialect.quoteIdentifier
    [https://github.com/apache/spark/blob/branch-2.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L645]

    and the default dialect.quoteIdentifier wraps the column name in double quotes ("):

      // Default dialect behaviour: surround the column name with double quotes.
      // Embedded quote characters in the name are not escaped.
      def quoteIdentifier(colName: String): String =
        "\"" + colName + "\""
    

    [https://github.com/apache/spark/blob/branch-2.1/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala#L90]

    Solution: de-register the existing OracleDialect and register a replacement that overrides dialect.quoteIdentifier, along with the other overrides needed to work with Oracle.

    import java.sql.Types
    import org.apache.spark.sql.types._
    import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
    import org.apache.spark.sql.jdbc.{ JdbcDialects, JdbcType, JdbcDialect }
    
    
    // JDBC URL of the target Oracle database.
    // NOTE(review): HOST, the port and SID look like placeholders - replace with real values.
    val url= "jdbc:oracle:thin:@HOST:1567/SID"
    
    // Alias for the JdbcDialects registry object.
    val dialect = JdbcDialects
    // Remove whichever dialect JdbcDialects.get resolves for this URL, so the
    // custom dialect defined below is the one used for it.
    JdbcDialects.unregisterDialect(dialect.get(url))
    
    // Custom Oracle dialect: supplies its own Oracle <-> Catalyst type mappings and
    // leaves column names unquoted so the created table's columns are not case-sensitive.
    val OracleDialect = new JdbcDialect {
      // Only claim genuine Oracle JDBC URLs. A plain substring match on "oracle" would
      // also capture URLs of other databases that merely contain that word (for
      // example in a host or schema name) and apply Oracle-specific DDL to them.
      override def canHandle(url: String): Boolean = url.startsWith("jdbc:oracle")

      // Maps a JDBC column type to a Catalyst type; returns None to fall back to
      // Spark's default mapping.
      override def getCatalystType(sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
        // Handle NUMBER fields that have no precision/scale in a special way, because
        // JDBC ResultSetMetaData reports them with 0 precision and -127 scale.
        if (sqlType == Types.NUMERIC && size == 0) {
          // This is sub-optimal as we have to pick a precision/scale in advance whereas the data in Oracle is allowed
          //  to have different precision/scale for each value.  This conversion works in our domain for now though we
          //  need a more durable solution.  Look into changing JDBCRDD (line 406):
          //    FROM:  mutableRow.update(i, Decimal(decimalVal, p, s))
          //    TO:  mutableRow.update(i, Decimal(decimalVal))
          Some(DecimalType(DecimalType.MAX_PRECISION, 10))
        } // Handle Timestamp with timezone (for now we are just converting this to a string with default format)
        //else if (sqlType == -101) {
        // Some(StringType)
        // }
        else None
      }

      // Maps a Catalyst type to the Oracle column type used in generated DDL; returns
      // None (including for DecimalType) to fall back to Spark's default mapping.
      // NOTE(review): DoubleType/FloatType map to NUMBER(19,4) and TimestampType maps
      // to DATE; these look lossy (fixed 4 decimals, presumably no fractional
      // seconds) - confirm this is acceptable for the data being written.
      override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
        case StringType            => Some(JdbcType("VARCHAR2(2000)", java.sql.Types.VARCHAR))
        case BooleanType           => Some(JdbcType("NUMBER(1)", java.sql.Types.NUMERIC))
        case IntegerType           => Some(JdbcType("NUMBER(10)", java.sql.Types.NUMERIC))
        case LongType              => Some(JdbcType("NUMBER(19)", java.sql.Types.NUMERIC))
        case DoubleType            => Some(JdbcType("NUMBER(19,4)", java.sql.Types.NUMERIC))
        case FloatType             => Some(JdbcType("NUMBER(19,4)", java.sql.Types.NUMERIC))
        case ShortType             => Some(JdbcType("NUMBER(5)", java.sql.Types.NUMERIC))
        case ByteType              => Some(JdbcType("NUMBER(3)", java.sql.Types.NUMERIC))
        case BinaryType            => Some(JdbcType("BLOB", java.sql.Types.BLOB))
        case TimestampType         => Some(JdbcType("DATE", java.sql.Types.TIMESTAMP))
        case DateType              => Some(JdbcType("DATE", java.sql.Types.DATE))
        //case DecimalType.Fixed(precision, scale) => Some(JdbcType("NUMBER(" + precision + "," + scale + ")", java.sql.Types.NUMERIC))
        //case DecimalType.Unlimited => Some(JdbcType("NUMBER(38,4)", java.sql.Types.NUMERIC))
        case _                     => None
      }

      // Important from Spark 2.0 onwards: return the name unquoted, otherwise the
      // Oracle table columns would be case-sensitive.
      // NOTE(review): names that are Oracle reserved words or contain special
      // characters are emitted unquoted too and will presumably produce invalid DDL.
      override def quoteIdentifier(colName: String): String = {
        colName
      }

    }
    
    // Register the custom dialect with the JdbcDialects registry.
    JdbcDialects.registerDialect(OracleDialect)
    
    0 讨论(0)
提交回复
热议问题