Tags: jdbc scala oracle database
I was looking to write a scala script to fetch large data from Oracle database. I was earlier using the Alteryx software to fetch data (around 1 million rows) from oracle which took around 7 minutes. Licensed softwares like Alteryx or Informatica, etc are great, but I was looking for a free solution.
We need ojdbc6.jar to connect to the Oracle 12 c database. We will create a Scala script to fetch data from the oracle database & save it to the output file (CSV format).
Lets create a simple function timed to time our execution
| object ScalaJdbcConnect { | |
| def timed[T](label : String, code: => T) : T = { | |
| val start = System.currentTimeMillis() | |
| val result = code | |
| val stop = System.currentTimeMillis() | |
| timing.append(s”processing $label took $(stop - start) ms.\n”) | |
| result | |
| } | |
| def main(args: Array[String]): Unit { | |
| timed(“setup connection to oracle database”, connOracle) | |
| } | |
| } |
Now create function connOracle to connect to Oracle
| import java.sql.DriverManager | |
| import java.sql.Connection | |
| import oracle.jdbc.pool.OracleDataSource | |
| def connOracle : Unit = { | |
| val query = """ | |
| SELECT * FROM TABLE1 | |
| """ | |
| val connection : Connection = null | |
| val oracleUser = “ORACLE USER” | |
| val oraclePassword = “ORACLE USER PASSWORD” | |
| val oracleURL = “jdbc:oracle:thin:@//$HOST:$PORT/$SID” | |
| val ods = new OracleDataSource() | |
| ods.setUser(oracleUser) | |
| ods.setURL(oracleURL) | |
| ods.setPassword(oraclePassword) | |
| val con = ods.getConnection() | |
| val statement = con.createStatement() | |
| val resultSet : java.sql.ResultSet = statement.executeQuery(query) | |
| } |
This took me 150 minutes to fetch data 1 million records. That was not the performance I was looking for 😟
However I realized that I can increase JDBC performance by tweaking optimal fetch value. The default fetch size is 10 for JDBC. Since I have more than million rows in the database, I should increase the fetch size to 1000.
| import java.sql.DriverManager | |
| import java.sql.Connection | |
| import oracle.jdbc.pool.OracleDataSource | |
| def connOracle : Unit = { | |
| val query = """ | |
| SELECT * FROM TABLE1 | |
| """ | |
| val connection : Connection = null | |
| val oracleUser = “ORACLE USER” | |
| val oraclePassword = “ORACLE USER PASSWORD” | |
| val oracleURL = “jdbc:oracle:thin:@//$HOST:$PORT/$SID” | |
| val ods = new OracleDataSource() | |
| ods.setUser(oracleUser) | |
| ods.setURL(oracleURL) | |
| ods.setPassword(oraclePassword) | |
| val con = ods.getConnection() | |
| val statement = con.createStatement() | |
| statement.setFetchSize(1000) // important | |
| val resultSet : java.sql.ResultSet = statement.executeQuery(query) | |
| } | |
| } |
The execution time improved drastically and reduced to approx 8 minutes.
Lastly it’s trivial to save the output to flat file (CSV format)
| import java.io.FileWriter | |
| import au.com.bytecode.opencsv.CSVWriter | |
| val resultSet : java.sql.ResultSet = statement.executeQuery(query) | |
| val csvWriter = new CSVWriter(new FileWriter(“Ouput.csv”), ‘,’) | |
| csvWriter.writeAll(resultSet, true) | |
| csvWriter.close() |
Also below is the build.sbt for resolving dependency
name := “Oracle Connection”
Version := “0.1”
scalaVersion := “2.12.4”
libraryDependencies += “au.com.bytecode” % “opencsv” % “2.4”