Friday, May 15, 2015

c# - Extract data from Green Plum Warehouse Tables PostGreSql

Below are two ways to extract data from Green Plum Warehouse tables.For tables that are small the code is straightforward.However for tables with millions of rows the steps are a little different.
I have listed the code for both here.

I have used the Npgsql.dll to create the connection to progresql Green Plum Warehouse.Add it as a reference in your C# project.Documentation can be found here http://npgsql.projects.pgfoundry.org/

1) For tables with small number of rows.

using Npgsql;
using System;
using System.Configuration;

namespace GPExtract
{
    class Program
    {
        static void Main(string[] args)
        {
            try
            {
                ExtractData();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
                Console.WriteLine(ex.StackTrace);
            }
            Console.WriteLine("Done !");
            Console.Read();
        }
        private static void ExtractData()
        {
            using (NpgsqlConnection oConn = new                             NpgsqlConnection(ConfigurationManager.AppSettings["GP_CONNECTION_STRING"]))
            {
                       NpgsqlCommand oCmd = new NpgsqlCommand(@"select distinct(id),min(create_ts),max(create_ts) from mytable group by id order by id", oConn);
                       oCmd.CommandTimeout = int.Parse(ConfigurationManager.AppSettings["COMMAND_TIMEOUT"]);
                      NpgsqlDataReader oReader = oCmd.ExecuteReader();

                      using (StreamWriter oWriter = new StreamWriter(ConfigurationManager.AppSettings["OUTPUT_FOLDER_PATH"] + "myoutputfile.txt"))
                      {
                              while (oReader.Read())
                              {
                                        oWriter.WriteLine(string.Format("{0}\t{1}\t{2}", dr.GetValue(0) != null ? dr.GetValue(0).ToString() : "", dr.GetValue(1) != null ? GetFormattedDateTime(dr.GetValue(1).ToString()) : "", dr.GetValue(2) != null ? GetFormattedDateTime(dr.GetValue(2).ToString()) : ""));
                               }
                       }
             }
         }
    }
}

2) In case of millions of rows the extraction of all rows is done by creating file on Green Plum accessible share, a temporary external table and loading the external table with the data from the Green Plum warehouse table. 

using Npgsql;
using System;
using System.Configuration;

namespace GPExtract
{
    class Program
    {
        static void Main(string[] args)
        {
            try
            {
                string sGpFilePath = ConfigurationManager.AppSettings["GP_FILE_PATH"];
                string sOutputFilePath = ConfigurationManager.AppSettings["OUTPUT_FILE_PATH"];
                ExtractData(sGpFilePath,sOutputFilePath);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
                Console.WriteLine(ex.StackTrace);
            }
            Console.WriteLine("Done !");
            Console.Read();
        }
        private static void ExtractData(string sGpFilePath,string sOutputFilePath)
        {
                if(!File.Exists(sGPFilePath)
               {
                          //Create a file for GP to load from external table
                          File.Create(sGpFilePath);

                          string sSql = @"CREATE WRITABLE EXTERNAL TEMP TABLE temp_mytable (  id character varying(22),
  min_create_dt integer,
  max_create_dt integer)LOCATION (
                'gpfdist://server-1:8082/mylogin/DATA/myfile.txt',
                'gpfdist://server-1:8083/mylogin/DATA/myfile.txt',
                'gpfdist://server-1:8084/mylogin/DATA/myfile.txt',
                'gpfdist://server-1:8085/mylogin/DATA/myfile.txt')
                FORMAT 'text' (delimiter E'\t' null '' )
                ENCODING 'LATIN9'
                DISTRIBUTED randomly;
            insert into temp_mytable
                select distinct(id),min(create_dt),max(create_dt) from  mytable group by id order by id;";

            using (NpgsqlConnection oConn = new NpgsqlConnection(ConfigurationManager.AppSettings["GP_CONNECTION_STRING"]))
            {
                oConn.Open();
                NpgsqlCommand oCmd = new NpgsqlCommand(sSql, oConn);
                oCmd.ExecuteNonQuery();
            }
             //Move the file from GP share to server
              File.Move(sGpFilePath, sOutputFilePath);
        }
    }
}

Note: The second program is parallel extract across ports 8082-8085 and is faster.

No comments: