Changes

Jump to navigation Jump to search
4,409 bytes added ,  17:31, 30 January 2018
END AS excludeflag
FROM round;
 
'''NOTES FROM ED'''
*There wasn't a colevel, portco keyed master flag table, so I made one from '''roundplus''' (see below)
*I also included basic round summary data
 
DROP TABLE PortCoFlagMaster;
CREATE TABLE PortCoFlagMaster AS
SELECT coname, statecode, datefirstinv, city
max(growthflag) AS hadgrowth, sum(growthflag) as numgrowth,
max(seedflag) AS hadseed, sum(seedflag) as numseed,
max(earlyflag) AS hadearly, sum(earlyflag) as numearly,
max(laterflag) AS hadlater, sum(laterflag) as numlater,
max(transactionflag) AS hadtrans, sum(transactionflag) as numtrans,
max(excludeflag) AS hadexcl, sum(excludeflag) as numexcl,
max(dealflag) AS haddeal,
COUNT(rounddate) as numrounds, sum(numinvestors) as totalinvestors
FROM roundplus
GROUP BY coname, statecode, datefirstinv, city;
--44721
 
Code is in FixingVCDB2.sql in Z:\VentureCapitalData\SDCVCData\vcdb2
 
==Fixing erroneous geo-coordinates==
Some of the geocoordinates in the db are dirty and point to locations in India, Eastern Europe. However, the company addresses exist. Isolate the dirty geo-coordinates and do a lookup using Geocode.py script. To isolate place a box around the continental US and flag all points that fall outside the box. Add back the points that are located in Hawaii and Puerto Rico. Then import back into db.
==Cleaning firmbase==
 
NOTE FROM MEGHANA:
 
*The base table - firmbase - was reloaded after it was found to be incorrectly normalized.
*[[Retrieving_US_VC_Data_From_SDC#VC_Funds]] was corrected
*The code below has been rerun!
 
First flag the undisclosed funds.
 
The firmbase table contains undisclosed firms. Add a flag and remove them. Then use firmname, statecode, foundingdate as the key for this table. Check that is valid and make your core table.
 
DROP TABLE firmbase1;
CREATE TABLE firmbase1 AS
SELECT *, CASE
ELSE 0::int END AS undisclosedflag
FROM firmbase;
--145679734
SELECT COUNT(*) FROM firmbase1 WHERE undisclosedflag = 0;
--141459452
SELECT COUNT(*) FROM (SELECT DISTINCT firmname, statecode, foundingdate FROM firmbase1 WHERE undisclosedflag = 0)a;
--141459452  DROP TABLE firmbasecore;
CREATE TABLE firmbasecore AS
SELECT * FROM firmbase1 WHERE undisclosedflag = 0;
--141459452
Instead we chose to use only firmname as the key because there were not too many duplicates. We remove the duplicates by selecting the lesser foundingdate.
GROUP BY firmname
HAVING COUNT(*) > 1;
--126
DROP TABLE firmbaseinclude;
INNER JOIN firmbaseduplicates AS d ON f.firmname = d.firmname
GROUP BY f.firmname;
--126
DROP TABLE firmbasecorefirmbasec; CREATE TABLE firmbasecore firmbasec AS SELECT l.*
FROM firmbase1 AS l
LEFT JOIN firmbaseinclude AS r ON r.firmname = l.firmname AND r.foundingdate = l.foundingdate
WHERE r.firmname IS NULL AND undisclosedflag = 0;
--141339446  DROP TABLE firmbasecore; CREATE TABLE firmbasecore AS SELECT * FROM firmbasec WHERE firmname NOT LIKE 'Amundi%';
SELECT COUNT(DISTINCT firmname) FROM firmbasecore;
--141339445 NOTE: We have dropped 'Amundi' for reasons specified in "Cleaning branchoffices," below. ==Cleaning branchoffices==  DROP TABLE branchofficesbase; CREATE TABLE branchofficesbase AS SELECT *, CASE WHEN firmname LIKE '%Undisclosed Firm%' THEN 1::int ELSE 0::int END AS undisclosedflag FROM branchoffices; --9734  SELECT COUNT(*) FROM branchofficesbase WHERE undisclosedflag = 0; --9452  DROP TABLE branchofficecore; CREATE TABLE branchofficecore AS SELECT * FROM branchofficesbase WHERE undisclosedflag = 0; --9452  DROP TABLE bocore; CREATE TABLE bocore AS SELECT * FROM branchofficecore WHERE firmname not like 'Amundi%'; --9450  SELECT COUNT(*) FROM (SELECT DISTINCT firmname FROM bocore WHERE bocity IS NOT NULL) AS T; --1962 There are no duplicates in firmbasecore. In firmbase, 'Amundi Private Equity Funds SA' is the only duplicate, but we dropped the one with the earlier founding date when we cleaned firmbase. Amundi has 2 branch offices in the same city (Bucharest) but different zip codes, which is why there are different counts when we use firmname/firmname,bocity vs firmname, bozip as our key. NOTE: We have dropped "Amundi" from firmbasecore, as well as branchofficecore, in order to use firmname as a valid key for bocore.
==Cleaning fundbase==
 
 
NOTE FROM ED:
*The base table - '''fundbase''' - was reloaded after it was found to be incorrectly normalized.
**All sorts of fields were copied down when they shouldn't have been.
*[[Retrieving_US_VC_Data_From_SDC#VC_Funds]] was corrected
*The code in this section has been rerun.
 
First flag the undisclosed funds.
DROP TABLE fundbase1;
CREATE TABLE fundbase1 AS
SELECT *, CASE
FROM fundbase;
--27588
 
SELECT COUNT(*) FROM fundbase1 WHERE undisclosedflag = 0;
--27097
 
SELECT COUNT(*) FROM (SELECT DISTINCT fundname, firstinvdate FROM fundbase1 WHERE undisclosedflag = 0)a;
--27097
 
You can see that fundname, firstinvdate is a good key. But we're going to use simply the fundname as a key because it will be easier to do join operations later.
 
DROP TABLE fundbasecore;
CREATE TABLE fundbasecore AS
SELECT *
SELECT COUNT(*) FROM (SELECT DISTINCT fundname FROM fundbase1 WHERE undisclosedflag = 0)a;
--27050
 
The plan is to grab all the duplicate fundnames and only include the ones with the MIN(closedate) AND MIN(lastinvdate) in the fundbasecore table.
 
DROP TABLE fundnameexclude;
CREATE TABLE fundnameexclude AS
FROM fundbase1 AS f
INNER JOIN fundinclude AS fu ON f.fundname = fu.fundname AND f.closedate = fu.closedate AND f.lastinvdate = fu.lastinvdate;
--4417
--create fundcore table
UNION ALL
SELECT * FROM fundinclude2;
--2704727020
==Name based matching firms to funds==
SELECT fundname, firstinvdate, firmname
FROM fundbasecore;
--2709727020
\COPY fundkeysandfirms TO 'fundkeysandfirms.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
FROM firmbasecore;
\COPY firmkeys TO 'firmkeys.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
--141459446
CREATE TABLE matcherfirmsfunds (
\COPY matcherfirmsfunds FROM 'matcheroutputfundsfirms.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
--2364
 
==Joining firms with funds==
DROP TABLE firmfundstestjoin;
==Creating portcoexitmaster==
Portcoexitmaster contains the portcokey with an exitflag, ipoflag and maflag and an exit value. It is built off the companybaseipomasmaster table so be sure you've built this first.Note that the amounts are actually in m (not k) in the input.  ALTER TABLE portcoexitbuild RENAME TO portcoexitbuildbak;  
CREATE TABLE portcoexitbuild AS
SELECT coname, statecode, datefirstinv, ipoissuedate, masannounceddate, ipoprincipalamtkipoamt, mastransactionamtkmaamt, investedk
FROM companybaseipomasmaster;
--44740
DROP TABLE portcoexitmaster;
CREATE TABLE portcoexitmaster AS
SELECT coname, statecode, datefirstinv, investedk, CASE WHEN ipoissuedate IS NOT NULL THEN ipoissuedate WHEN masannounceddate IS NOT NULL THEN masannounceddate ELSE NULL END AS exitdate,
CASE WHEN ipoissuedate IS NOT NULL THEN 1::int ELSE 0::int END AS ipoflag,
CASE WHEN masannounceddate IS NOT NULL THEN 1::int ELSE 0::int END AS maflag,
CASE WHEN ipoissuedate IS NOT NULL OR masannounceddate IS NOT NULL THEN 1::int ELSE 0::int END AS exitflag,
CASE WHEN ipoissuedate IS NOT NULL THEN ipoprincipalamtkipoamt::numeric::float8/1000 ELSE mastransactionamtkmaamt::numeric::float8/1000 END AS
exitvaluem
FROM portcoexitbuild;
--22374
\COPY roundleveloutput2 TO 'roundleveloutput2.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
 
===Creating a PortCoMaster Table===
 
Using '''sdccompanybasecore2''' as a base table, Ed created a useful PortCoMasterTable. This table isn't perfect:
*sdccompanybasecore2 was drawn later and has 44966. It has fixed addresses (without copy down mistakes)
*The other tables - PortCoDeadAliveMaster, PortCoExitMaster, and PortCoFlagMaster - are built off slightly earlier draws and so have 44740 records
-The other tables are all keyed by coname, statecode and datefirstinv.
 
DROP TABLE PortCoMaster;
CREATE TABLE PortCoMaster AS
SELECT A.*,
B.deaddate, B.aliveyear, B.deadyear,
C.ipoflag, C.maflag, C.exitflag, C.exitvaluem,
D.hadgrowth, D.numgrowth, D.hadseed, D.numseed, D.hadearly, D.numearly, D.hadlater, D.numlater, D.hadtrans,
D.numtrans, D.hadexcl, D.numexcl, D.haddeal, D.numrounds, D.totalinvestors
FROM sdccompanybasecore2 AS A
LEFT JOIN PortCoDeadAliveMaster AS B ON A.coname=B.coname AND A.statecode=B.statecode AND
A.datefirstinv=B.datefirstinv
LEFT JOIN PortCoExitMaster AS C ON A.coname=C.coname AND A.statecode=C.statecode AND A.datefirstinv=C.datefirstinv
LEFT JOIN PortCoFlagMaster AS D ON A.coname=D.coname AND A.statecode=D.statecode AND A.datefirstinv=D.datefirstinv;
--44966
 
\COPY PortCoMaster TO 'PortCoMaster.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
--44966
==Cleaning roundline==

Navigation menu