Changes

Jump to navigation Jump to search
2,013 bytes added ,  17:31, 30 January 2018
==Cleaning firmbase==
 
NOTE FROM MEGHANA:
 
*The base table - firmbase - was reloaded after it was found to be incorrectly normalized.
*[[Retrieving_US_VC_Data_From_SDC#VC_Funds]] was corrected
*The code below has been rerun!
 
First flag the undisclosed funds.
 
The firmbase table contains undisclosed firms. Add a flag and remove them. Then use firmname, statecode, foundingdate as the key for this table. Check that is valid and make your core table.
 
DROP TABLE firmbase1;
CREATE TABLE firmbase1 AS
SELECT *, CASE
ELSE 0::int END AS undisclosedflag
FROM firmbase;
--145679734
SELECT COUNT(*) FROM firmbase1 WHERE undisclosedflag = 0;
--141459452
SELECT COUNT(*) FROM (SELECT DISTINCT firmname, statecode, foundingdate FROM firmbase1 WHERE undisclosedflag = 0)a;
--141459452  DROP TABLE firmbasecore;
CREATE TABLE firmbasecore AS
SELECT * FROM firmbase1 WHERE undisclosedflag = 0;
--141459452
Instead we chose to use only firmname as the key because there were not too many duplicates. We remove the duplicates by selecting the lesser foundingdate.
GROUP BY firmname
HAVING COUNT(*) > 1;
--126
DROP TABLE firmbaseinclude;
INNER JOIN firmbaseduplicates AS d ON f.firmname = d.firmname
GROUP BY f.firmname;
--126
DROP TABLE firmbasecorefirmbasec; CREATE TABLE firmbasecore firmbasec AS SELECT l.*
FROM firmbase1 AS l
LEFT JOIN firmbaseinclude AS r ON r.firmname = l.firmname AND r.foundingdate = l.foundingdate
WHERE r.firmname IS NULL AND undisclosedflag = 0;
--141339446  DROP TABLE firmbasecore; CREATE TABLE firmbasecore AS SELECT * FROM firmbasec WHERE firmname NOT LIKE 'Amundi%';
SELECT COUNT(DISTINCT firmname) FROM firmbasecore;
--141339445 NOTE: We have dropped 'Amundi' for reasons specified in "Cleaning branchoffices," below. ==Cleaning branchoffices==  DROP TABLE branchofficesbase; CREATE TABLE branchofficesbase AS SELECT *, CASE WHEN firmname LIKE '%Undisclosed Firm%' THEN 1::int ELSE 0::int END AS undisclosedflag FROM branchoffices; --9734  SELECT COUNT(*) FROM branchofficesbase WHERE undisclosedflag = 0; --9452  DROP TABLE branchofficecore; CREATE TABLE branchofficecore AS SELECT * FROM branchofficesbase WHERE undisclosedflag = 0; --9452  DROP TABLE bocore; CREATE TABLE bocore AS SELECT * FROM branchofficecore WHERE firmname not like 'Amundi%'; --9450  SELECT COUNT(*) FROM (SELECT DISTINCT firmname FROM bocore WHERE bocity IS NOT NULL) AS T; --1962 There are no duplicates in firmbasecore. In firmbase, 'Amundi Private Equity Funds SA' is the only duplicate, but we dropped the one with the earlier founding date when we cleaned firmbase. Amundi has 2 branch offices in the same city (Bucharest) but different zip codes, which is why there are different counts when we use firmname/firmname,bocity vs firmname, bozip as our key. NOTE: We have dropped "Amundi" from firmbasecore, as well as branchofficecore, in order to use firmname as a valid key for bocore.
==Cleaning fundbase==
NOTE FROM ED:
*The base table - '''fundbase''' - was reloaded after it was found to be incorrectly normalized.
**All sorts of fields were copied down when they shouldn't have been.
*[[Retrieving_US_VC_Data_From_SDC#VC_Funds]] was corrected
*The code below HAS NOT in this section has been rerun!.
First flag the undisclosed funds.
DROP TABLE fundbase1;
CREATE TABLE fundbase1 AS
SELECT *, CASE
FROM fundbase;
--27588
 
SELECT COUNT(*) FROM fundbase1 WHERE undisclosedflag = 0;
--27097
 
SELECT COUNT(*) FROM (SELECT DISTINCT fundname, firstinvdate FROM fundbase1 WHERE undisclosedflag = 0)a;
--27097
 
You can see that fundname, firstinvdate is a good key. But we're going to use simply the fundname as a key because it will be easier to do join operations later.
 
DROP TABLE fundbasecore;
CREATE TABLE fundbasecore AS
SELECT *
SELECT COUNT(*) FROM (SELECT DISTINCT fundname FROM fundbase1 WHERE undisclosedflag = 0)a;
--27050
 
The plan is to grab all the duplicate fundnames and only include the ones with the MIN(closedate) AND MIN(lastinvdate) in the fundbasecore table.
 
DROP TABLE fundnameexclude;
CREATE TABLE fundnameexclude AS
FROM fundbase1 AS f
INNER JOIN fundinclude AS fu ON f.fundname = fu.fundname AND f.closedate = fu.closedate AND f.lastinvdate = fu.lastinvdate;
--4417
--create fundcore table
UNION ALL
SELECT * FROM fundinclude2;
--2704727020
==Name based matching firms to funds==
SELECT fundname, firstinvdate, firmname
FROM fundbasecore;
--2709727020
\COPY fundkeysandfirms TO 'fundkeysandfirms.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
FROM firmbasecore;
\COPY firmkeys TO 'firmkeys.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
--141459446
CREATE TABLE matcherfirmsfunds (
\COPY matcherfirmsfunds FROM 'matcheroutputfundsfirms.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
--2364
 
==Joining firms with funds==
DROP TABLE firmfundstestjoin;
==Creating portcoexitmaster==
Portcoexitmaster contains the portcokey with an exitflag, ipoflag and maflag and an exit value. It is built off the companybaseipomasmaster table so be sure you've built this first.Note that the amounts are actually in m (not k) in the input.  ALTER TABLE portcoexitbuild RENAME TO portcoexitbuildbak;  
CREATE TABLE portcoexitbuild AS
SELECT coname, statecode, datefirstinv, ipoissuedate, masannounceddate, ipoprincipalamtkipoamt, mastransactionamtkmaamt, investedk
FROM companybaseipomasmaster;
--44740
DROP TABLE portcoexitmaster;
CREATE TABLE portcoexitmaster AS
SELECT coname, statecode, datefirstinv, investedk, CASE WHEN ipoissuedate IS NOT NULL THEN ipoissuedate WHEN masannounceddate IS NOT NULL THEN masannounceddate ELSE NULL END AS exitdate,
CASE WHEN ipoissuedate IS NOT NULL THEN 1::int ELSE 0::int END AS ipoflag,
CASE WHEN masannounceddate IS NOT NULL THEN 1::int ELSE 0::int END AS maflag,
CASE WHEN ipoissuedate IS NOT NULL OR masannounceddate IS NOT NULL THEN 1::int ELSE 0::int END AS exitflag,
CASE WHEN ipoissuedate IS NOT NULL THEN ipoprincipalamtkipoamt::numeric::float8/1000 ELSE mastransactionamtkmaamt::numeric::float8/1000 END AS
exitvaluem
FROM portcoexitbuild;

Navigation menu