Database System Concepts

Authors: Abraham Silberschatz, Henry F. Korth, and S. Sudarshan

ISBN: 978-0-07-352332-3

Table of Contents

In [1]:
%defaultDatasource jdbc:h2:mem:db

Appendix A: Detailed University Schema

E-R Diagram for University

In [2]:
CREATE TABLE IF NOT EXISTS classroom (
    building    VARCHAR(15),
    roomNumber  VARCHAR(7),
    capacity    NUMERIC(4,0),
    PRIMARY KEY (building, roomNumber)
);

CREATE TABLE IF NOT EXISTS department (
    deptName    VARCHAR(20),
    building    VARCHAR(15),
    budget      NUMERIC(12,2) CHECK (budget > 0),
    PRIMARY KEY (deptName)
);

CREATE TABLE IF NOT EXISTS course (
    courseId    VARCHAR(8),
    title       VARCHAR(50),
    deptName    VARCHAR(20),
    credits     NUMERIC(2,0) CHECK (credits > 0),
    PRIMARY KEY (courseId),
    FOREIGN KEY (deptName) REFERENCES department ON DELETE SET NULL
);

CREATE TABLE IF NOT EXISTS instructor (
    instructorId VARCHAR(5),
    name         VARCHAR(20) NOT NULL,
    deptName     VARCHAR(20),
    salary       NUMERIC(8,2) CHECK (salary > 29000),
    PRIMARY KEY (instructorId),
    FOREIGN KEY (deptName) REFERENCES department ON DELETE SET NULL
);

CREATE TABLE IF NOT EXISTS section (
    courseId    VARCHAR(8),
    sectionId   VARCHAR(8),
    semester    VARCHAR(6) CHECK (semester IN ('Fall', 'Winter', 'Spring', 'Summer')),
    year        NUMERIC(4,0) CHECK (year > 1701 AND year < 2100),
    building    VARCHAR(15),
    roomNumber  VARCHAR(7),
    timeSlotId  VARCHAR(4),
    PRIMARY KEY (courseId, sectionId, semester, year),
    FOREIGN KEY (courseId) REFERENCES course ON DELETE CASCADE,
    FOREIGN KEY (building, roomNumber) REFERENCES classroom ON DELETE SET NULL
);

CREATE TABLE IF NOT EXISTS teaches (
    instructorId VARCHAR(5),
    courseId     VARCHAR(8),
    sectionId    VARCHAR(8),
    semester     VARCHAR(6),
    year         NUMERIC(4,0),
    PRIMARY KEY (instructorId, courseId, sectionId, semester, year),
    FOREIGN KEY (courseId, sectionId, semester, year) REFERENCES section ON DELETE CASCADE,
    FOREIGN KEY (instructorId) REFERENCES instructor ON DELETE CASCADE
);

CREATE TABLE IF NOT EXISTS student (
    studentId   VARCHAR(5),
    name        VARCHAR(20) NOT NULL,
    deptName    VARCHAR(20),
    totalCredit NUMERIC(3,0) CHECK (totalCredit >= 0),
    PRIMARY KEY (studentId),
    FOREIGN KEY (deptName) REFERENCES department ON DELETE SET NULL
);

CREATE TABLE IF NOT EXISTS takes (
    studentId   VARCHAR(5),
    courseId    VARCHAR(8),
    sectionId   VARCHAR(8),
    semester    VARCHAR(6),
    year        NUMERIC(4,0),
    grade       VARCHAR(2),
    PRIMARY KEY (studentId, courseId, sectionId, semester, year),
    FOREIGN KEY (courseId, sectionId, semester, year) REFERENCES section ON DELETE CASCADE,
    FOREIGN KEY (studentId) REFERENCES student ON DELETE CASCADE
);

CREATE TABLE IF NOT EXISTS advisor (
    studentId    VARCHAR(5),
    instructorId VARCHAR(5),
    PRIMARY KEY (studentId),
    FOREIGN KEY (studentId) REFERENCES student ON DELETE CASCADE,
    FOREIGN KEY (instructorId) REFERENCES instructor ON DELETE SET NULL
);

CREATE TABLE IF NOT EXISTS prereq (
    courseId    VARCHAR(8),
    prereqId    VARCHAR(8),
    PRIMARY KEY (courseId, prereqId),
    FOREIGN KEY (courseId) REFERENCES course ON DELETE CASCADE,
    FOREIGN KEY (prereqId) REFERENCES course
);

CREATE TABLE IF NOT EXISTS timeSlot (
    timeSlotId  VARCHAR(4),
    day         VARCHAR(1),
    startHr     NUMERIC(2) CHECK (startHr >= 0 AND startHr < 24),
    startMin    NUMERIC(2) CHECK (startMin >= 0 AND startMin < 60),
    endHr       NUMERIC(2) CHECK (endHr >= 0 AND endHr < 24),
    endMin      NUMERIC(2) CHECK (endMin >= 0 AND endMin < 60),
    PRIMARY KEY (timeSlotId, day, startHr, startMin)
);
In [3]:
-- Clear Tables
DELETE FROM prereq;
DELETE FROM timeSlot;
DELETE FROM advisor;
DELETE FROM takes;
DELETE FROM student;
DELETE FROM teaches;
DELETE FROM section;
DELETE FROM instructor;
DELETE FROM course;
DELETE FROM department;
DELETE FROM classroom;
-- Classroom
INSERT INTO classroom VALUES ('Packard', '101', '500');
INSERT INTO classroom VALUES ('Painter', '514', '10');
INSERT INTO classroom VALUES ('Taylor', '3128', '70');
INSERT INTO classroom VALUES ('Watson', '100', '30');
INSERT INTO classroom VALUES ('Watson', '120', '50');
-- Department
INSERT INTO department VALUES ('Biology', 'Watson', '90000');
INSERT INTO department VALUES ('Comp. Sci.', 'Taylor', '100000');
INSERT INTO department VALUES ('Elec. Eng.', 'Taylor', '85000');
INSERT INTO department VALUES ('Finance', 'Painter', '120000');
INSERT INTO department VALUES ('History', 'Painter', '50000');
INSERT INTO department VALUES ('Music', 'Packard', '80000');
INSERT INTO department VALUES ('Physics', 'Watson', '70000');
-- Course
INSERT INTO course VALUES ('BIO-101', 'Intro. to Biology', 'Biology', '4');
INSERT INTO course VALUES ('BIO-301', 'Genetics', 'Biology', '4');
INSERT INTO course VALUES ('BIO-399', 'Computational Biology', 'Biology', '3');
INSERT INTO course VALUES ('CS-101', 'Intro. to Computer Science', 'Comp. Sci.', '4');
INSERT INTO course VALUES ('CS-190', 'Game Design', 'Comp. Sci.', '4');
INSERT INTO course VALUES ('CS-315', 'Robotics', 'Comp. Sci.', '3');
INSERT INTO course VALUES ('CS-319', 'Image Processing', 'Comp. Sci.', '3');
INSERT INTO course VALUES ('CS-347', 'Database System Concepts', 'Comp. Sci.', '3');
INSERT INTO course VALUES ('EE-181', 'Intro. to Digital Systems', 'Elec. Eng.', '3');
INSERT INTO course VALUES ('FIN-201', 'Investment Banking', 'Finance', '3');
INSERT INTO course VALUES ('HIS-351', 'World History', 'History', '3');
INSERT INTO course VALUES ('MU-199', 'Music Video Production', 'Music', '3');
INSERT INTO course VALUES ('PHY-101', 'Physical Principles', 'Physics', '4');
-- Instructor
INSERT INTO instructor VALUES ('10101', 'Srinivasan', 'Comp. Sci.', '65000');
INSERT INTO instructor VALUES ('12121', 'Wu', 'Finance', '90000');
INSERT INTO instructor VALUES ('15151', 'Mozart', 'Music', '40000');
INSERT INTO instructor VALUES ('22222', 'Einstein', 'Physics', '95000');
INSERT INTO instructor VALUES ('32343', 'El Said', 'History', '60000');
INSERT INTO instructor VALUES ('33456', 'Gold', 'Physics', '87000');
INSERT INTO instructor VALUES ('45565', 'Katz', 'Comp. Sci.', '75000');
INSERT INTO instructor VALUES ('58583', 'Califieri', 'History', '62000');
INSERT INTO instructor VALUES ('76543', 'Singh', 'Finance', '80000');
INSERT INTO instructor VALUES ('76766', 'Crick', 'Biology', '72000');
INSERT INTO instructor VALUES ('83821', 'Brandt', 'Comp. Sci.', '92000');
INSERT INTO instructor VALUES ('98345', 'Kim', 'Elec. Eng.', '80000');
-- Section
INSERT INTO section VALUES ('BIO-101', '1', 'Summer', '2009', 'Painter', '514', 'B');
INSERT INTO section VALUES ('BIO-301', '1', 'Summer', '2010', 'Painter', '514', 'A');
INSERT INTO section VALUES ('CS-101', '1', 'Fall', '2009', 'Packard', '101', 'H');
INSERT INTO section VALUES ('CS-101', '1', 'Spring', '2010', 'Packard', '101', 'F');
INSERT INTO section VALUES ('CS-190', '1', 'Spring', '2009', 'Taylor', '3128', 'E');
INSERT INTO section VALUES ('CS-190', '2', 'Spring', '2009', 'Taylor', '3128', 'A');
INSERT INTO section VALUES ('CS-315', '1', 'Spring', '2010', 'Watson', '120', 'D');
INSERT INTO section VALUES ('CS-319', '1', 'Spring', '2010', 'Watson', '100', 'B');
INSERT INTO section VALUES ('CS-319', '2', 'Spring', '2010', 'Taylor', '3128', 'C');
INSERT INTO section VALUES ('CS-347', '1', 'Fall', '2009', 'Taylor', '3128', 'A');
INSERT INTO section VALUES ('EE-181', '1', 'Spring', '2009', 'Taylor', '3128', 'C');
INSERT INTO section VALUES ('FIN-201', '1', 'Spring', '2010', 'Packard', '101', 'B');
INSERT INTO section VALUES ('HIS-351', '1', 'Spring', '2010', 'Painter', '514', 'C');
INSERT INTO section VALUES ('MU-199', '1', 'Spring', '2010', 'Packard', '101', 'D');
INSERT INTO section VALUES ('PHY-101', '1', 'Fall', '2009', 'Watson', '100', 'A');
-- Teaches
INSERT INTO teaches VALUES ('10101', 'CS-101', '1', 'Fall', '2009');
INSERT INTO teaches VALUES ('10101', 'CS-315', '1', 'Spring', '2010');
INSERT INTO teaches VALUES ('10101', 'CS-347', '1', 'Fall', '2009');
INSERT INTO teaches VALUES ('12121', 'FIN-201', '1', 'Spring', '2010');
INSERT INTO teaches VALUES ('15151', 'MU-199', '1', 'Spring', '2010');
INSERT INTO teaches VALUES ('22222', 'PHY-101', '1', 'Fall', '2009');
INSERT INTO teaches VALUES ('32343', 'HIS-351', '1', 'Spring', '2010');
INSERT INTO teaches VALUES ('45565', 'CS-101', '1', 'Spring', '2010');
INSERT INTO teaches VALUES ('45565', 'CS-319', '1', 'Spring', '2010');
INSERT INTO teaches VALUES ('76766', 'BIO-101', '1', 'Summer', '2009');
INSERT INTO teaches VALUES ('76766', 'BIO-301', '1', 'Summer', '2010');
INSERT INTO teaches VALUES ('83821', 'CS-190', '1', 'Spring', '2009');
INSERT INTO teaches VALUES ('83821', 'CS-190', '2', 'Spring', '2009');
INSERT INTO teaches VALUES ('83821', 'CS-319', '2', 'Spring', '2010');
INSERT INTO teaches VALUES ('98345', 'EE-181', '1', 'Spring', '2009');
-- Student
INSERT INTO student VALUES ('00128', 'Zhang', 'Comp. Sci.', '102');
INSERT INTO student VALUES ('12345', 'Shankar', 'Comp. Sci.', '32');
INSERT INTO student VALUES ('19991', 'Brandt', 'History', '80');
INSERT INTO student VALUES ('23121', 'Chavez', 'Finance', '110');
INSERT INTO student VALUES ('44553', 'Peltier', 'Physics', '56');
INSERT INTO student VALUES ('45678', 'Levy', 'Physics', '46');
INSERT INTO student VALUES ('54321', 'Williams', 'Comp. Sci.', '54');
INSERT INTO student VALUES ('55739', 'Sanchez', 'Music', '38');
INSERT INTO student VALUES ('70557', 'Snow', 'Physics', '0');
INSERT INTO student VALUES ('76543', 'Brown', 'Comp. Sci.', '58');
INSERT INTO student VALUES ('76653', 'Aoi', 'Elec. Eng.', '60');
INSERT INTO student VALUES ('98765', 'Bourikas', 'Elec. Eng.', '98');
INSERT INTO student VALUES ('98988', 'Tanaka', 'Biology', '120');
-- Takes
INSERT INTO takes VALUES ('00128', 'CS-101', '1', 'Fall', '2009', 'A');
INSERT INTO takes VALUES ('00128', 'CS-347', '1', 'Fall', '2009', 'A-');
INSERT INTO takes VALUES ('12345', 'CS-101', '1', 'Fall', '2009', 'C');
INSERT INTO takes VALUES ('12345', 'CS-190', '2', 'Spring', '2009', 'A');
INSERT INTO takes VALUES ('12345', 'CS-315', '1', 'Spring', '2010', 'A');
INSERT INTO takes VALUES ('12345', 'CS-347', '1', 'Fall', '2009', 'A');
INSERT INTO takes VALUES ('19991', 'HIS-351', '1', 'Spring', '2010', 'B');
INSERT INTO takes VALUES ('23121', 'FIN-201', '1', 'Spring', '2010', 'C+');
INSERT INTO takes VALUES ('44553', 'PHY-101', '1', 'Fall', '2009', 'B-');
INSERT INTO takes VALUES ('45678', 'CS-101', '1', 'Fall', '2009', 'F');
INSERT INTO takes VALUES ('45678', 'CS-101', '1', 'Spring', '2010', 'B+');
INSERT INTO takes VALUES ('45678', 'CS-319', '1', 'Spring', '2010', 'B');
INSERT INTO takes VALUES ('54321', 'CS-101', '1', 'Fall', '2009', 'A-');
INSERT INTO takes VALUES ('54321', 'CS-190', '2', 'Spring', '2009', 'B+');
INSERT INTO takes VALUES ('55739', 'MU-199', '1', 'Spring', '2010', 'A-');
INSERT INTO takes VALUES ('76543', 'CS-101', '1', 'Fall', '2009', 'A');
INSERT INTO takes VALUES ('76543', 'CS-319', '2', 'Spring', '2010', 'A');
INSERT INTO takes VALUES ('76653', 'EE-181', '1', 'Spring', '2009', 'C');
INSERT INTO takes VALUES ('98765', 'CS-101', '1', 'Fall', '2009', 'C-');
INSERT INTO takes VALUES ('98765', 'CS-315', '1', 'Spring', '2010', 'B');
INSERT INTO takes VALUES ('98988', 'BIO-101', '1', 'Summer', '2009', 'A');
INSERT INTO takes VALUES ('98988', 'BIO-301', '1', 'Summer', '2010', null);
-- Advisor
INSERT INTO advisor VALUES ('00128', '45565');
INSERT INTO advisor VALUES ('12345', '10101');
INSERT INTO advisor VALUES ('23121', '76543');
INSERT INTO advisor VALUES ('44553', '22222');
INSERT INTO advisor VALUES ('45678', '22222');
INSERT INTO advisor VALUES ('76543', '45565');
INSERT INTO advisor VALUES ('76653', '98345');
INSERT INTO advisor VALUES ('98765', '98345');
INSERT INTO advisor VALUES ('98988', '76766');
-- Time Slot
INSERT INTO timeSlot VALUES ('A', 'M', '8', '0', '8', '50');
INSERT INTO timeSlot VALUES ('A', 'W', '8', '0', '8', '50');
INSERT INTO timeSlot VALUES ('A', 'F', '8', '0', '8', '50');
INSERT INTO timeSlot VALUES ('B', 'M', '9', '0', '9', '50');
INSERT INTO timeSlot VALUES ('B', 'W', '9', '0', '9', '50');
INSERT INTO timeSlot VALUES ('B', 'F', '9', '0', '9', '50');
INSERT INTO timeSlot VALUES ('C', 'M', '11', '0', '11', '50');
INSERT INTO timeSlot VALUES ('C', 'W', '11', '0', '11', '50');
INSERT INTO timeSlot VALUES ('C', 'F', '11', '0', '11', '50');
INSERT INTO timeSlot VALUES ('D', 'M', '13', '0', '13', '50');
INSERT INTO timeSlot VALUES ('D', 'W', '13', '0', '13', '50');
INSERT INTO timeSlot VALUES ('D', 'F', '13', '0', '13', '50');
INSERT INTO timeSlot VALUES ('E', 'T', '10', '30', '11', '45');
INSERT INTO timeSlot VALUES ('E', 'R', '10', '30', '11', '45');
INSERT INTO timeSlot VALUES ('F', 'T', '14', '30', '15', '45');
INSERT INTO timeSlot VALUES ('F', 'R', '14', '30', '15', '45');
INSERT INTO timeSlot VALUES ('G', 'M', '16', '0', '16', '50');
INSERT INTO timeSlot VALUES ('G', 'W', '16', '0', '16', '50');
INSERT INTO timeSlot VALUES ('G', 'F', '16', '0', '16', '50');
INSERT INTO timeSlot VALUES ('H', 'W', '10', '0', '12', '30');
-- Prereq
INSERT INTO prereq VALUES ('BIO-301', 'BIO-101');
INSERT INTO prereq VALUES ('BIO-399', 'BIO-101');
INSERT INTO prereq VALUES ('CS-190', 'CS-101');
INSERT INTO prereq VALUES ('CS-315', 'CS-101');
INSERT INTO prereq VALUES ('CS-319', 'CS-101');
INSERT INTO prereq VALUES ('CS-347', 'CS-101');
INSERT INTO prereq VALUES ('EE-181', 'PHY-101');

Chapter 1: Introduction (TOC)

  • A database-management system (DBMS) consists of a collection of interrelated data and a collection of programs to access that data.

1.3 View of Data

Data Abstraction

  • Physical Level: How is the data stored in the database?
  • Logical Level: What data is stored in the database?
  • View Level: How should users of the database interact with the data?

Instances and Schemas

  • Instance: A collection of information stored in the database at a particular moment.
  • Schema: The overall design of the database.
    • Physical Schema: Describes the database design at the physical level.
    • Logical Schema: Describes the database design at the logical level.
  • Physical Data Independence: The ability to modify the physical schema without changing the logical schema or the application.
  • Logical Data Independence: The ability to modify the logical schema without changing the application.

Data Models

  • Data Model: A collection of conceptual tools for describing data, data relationships, data semantics, and consistency constraints.
    • Relational Model
    • Entity-Relationship Model
    • Object-Based Data Model
    • Semistructured Data Model

1.9 Database Architecture

Database Architecture

Chapter 2: Introduction to the Relational Model (TOC)

2.1 Structure of Relational Databases

  • Relation: A set of tuples $\left( a_1, a_2, ..., a_n \right)$ in which each value $a_i$ is a member of the domain $D_i$ of attribute $A_i$.
    • Relation = Table
    • Tuple = Row
    • Attribute = Column
  • Relation Instance: A specific set of tuples of a relation.
  • Domain: A set of permitted values of an attribute.
  • Atomic: A domain in which every element is semantically indivisible.
  • Null: A special value that signifies that the value is unknown or does not exist.

2.2 Database Schema

  • Database Schema: The logical design of a database.
  • Database Instance: A snapshot of the data in a database at a given instant in time.
  • Relation Schema: A list of attributes and their corresponding domains.
$$R \left( A_1, A_2, ..., A_n \right)$$

2.3 Keys

  • A superkey of a relation is a set of one or more attributes whose values are guaranteed to identify tuples in the relation uniquely.
  • A candidate key is a minimal superkey, that is, a set of attributes that forms a superkey, but none of whose subsets is a superkey.
  • A primary key is one of the candidate keys of a relation.
  • A foreign key is a set of attributes in a referencing relation, such that for each tuple in the referencing relation, the values of the foreign key attributes are guaranteed to occur as the primary key value of a tuple in the referenced relation.
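A quick sanity check against the schema above (a minimal sketch: an empty result only shows that uniqueness holds for the current instance, not that the schema enforces it):

-- Probe whether instructorId could serve as a candidate key of instructor;
-- any returned row would be a duplicated instructorId value.
SELECT instructorId, COUNT(*) AS occurrences
FROM instructor
GROUP BY instructorId
HAVING COUNT(*) > 1;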

Chapter 3: Introduction to SQL (TOC)

3.1 Overview of the SQL Query Language

  • Data-Definition Language (DDL) provides commands for defining relation schemas, deleting relations, and modifying relation schemas.
  • Data-Manipulation Language (DML) includes a query language and commands to insert tuples into, delete tuples from, and modify tuples in the database.

3.2 SQL Data Definition

Basic Types

In [4]:
-- INT
-- BOOLEAN
-- TINYINT
-- SMALLINT
-- BIGINT
-- IDENTITY
-- DECIMAL
-- DOUBLE
-- REAL
-- TIME
-- DATE
-- TIMESTAMP
-- TIMESTAMP WITH TIME ZONE
-- BINARY
-- OTHER
-- VARCHAR
-- VARCHAR_IGNORECASE
-- CHAR
-- BLOB
-- CLOB
-- UUID
-- ARRAY
-- ENUM
-- GEOMETRY
-- INTERVAL

Basic Schema Definition

In [5]:
-- Visit https://en.wikipedia.org/wiki/Data_definition_language
-- for more information regarding DDL.
-- CREATE TABLE r
-- (
-- Attribute1 Domain1,
-- Attribute2 Domain2,
-- ...
-- AttributeN DomainN,
-- (IntegrityConstraint1),
-- ...
-- (IntegrityConstraintk),
-- );
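As a concrete instance of the template above (a hypothetical gradePoint table, dropped immediately so the university schema is unchanged):

-- Hypothetical table used only to illustrate the DDL template.
CREATE TABLE IF NOT EXISTS gradePoint (
    grade   VARCHAR(2),
    points  NUMERIC(3,1) CHECK (points >= 0),
    PRIMARY KEY (grade)
);
DROP TABLE gradePoint;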

3.3 Basic Structure of SQL Queries

Select Clause and From Clause

  • The SELECT clause specifies the attributes to project for the output.
  • The FROM clause specifies the relation from which to query.
In [6]:
-- List the deptName attribute of the instructor relation.
SELECT deptName
FROM instructor;
In [7]:
-- List the deptName attribute of the instructor relation.
-- Disallow Duplicates
SELECT DISTINCT deptName
FROM instructor;
In [8]:
-- List the deptName attribute of the instructor relation.
-- Allow Duplicates
SELECT ALL deptName
FROM instructor;

Arithmetic Operations

In [9]:
-- List 1.
SELECT 1.0;

-- List Addition.
SELECT 1.0 + 1.0;

-- List Subtraction
SELECT 1.0 - 1.0;

-- List Multiplication.
SELECT 2.0 * 2.0;

-- List Division.
SELECT 1.0 / 2.0;

* Attribute

  • The * symbol denotes all attributes.
In [10]:
SELECT * FROM instructor;

Where Clause

  • The WHERE clause specifies conditions by which the query should be filtered.
In [11]:
-- Connectives: 'and', 'or', 'not'
-- Operators: '<', '<=', '>', '>=', '=', '<>'
SELECT name
FROM instructor
WHERE deptName = 'Comp. Sci.' AND salary > 70000;

Cartesian Product

  • The Cartesian product outputs all pairs of rows from the two input relations (regardless of whether or not they have the same values on common attributes).
In [12]:
SELECT name, courseId
FROM instructor, teaches
WHERE instructor.instructorId = teaches.instructorId 
    AND instructor.deptName = 'Comp. Sci.';

Natural Join

  • The natural join outputs pairs of rows from the two input relations that have the same value on all attributes that have the same name.
In [13]:
SELECT name, courseId
FROM instructor NATURAL JOIN teaches
WHERE deptName = 'Comp. Sci.';

3.4 Additional Basic Operations

Rename Operation

  • The AS operation renames attributes and relations, for brevity and to avoid ambiguity.
In [14]:
-- What are the names of all instructors whose salary 
-- is greater than at least one instructor in the Biology 
-- department?
SELECT DISTINCT T.name
FROM instructor AS T, instructor AS S
WHERE T.salary > S.salary AND S.deptName = 'Biology';

Order By Clause

  • The ORDER BY clause specifies the ordering by which the tuples in the result of a query should be sorted.
In [15]:
-- Alphabetically, what are the names of all instructors in the 
-- Physics department?
SELECT name
FROM instructor
WHERE deptName = 'Physics'
ORDER BY name;

-- Ascending
SELECT name
FROM instructor
WHERE deptName = 'Physics'
ORDER BY name ASC;

-- Descending
SELECT name
FROM instructor
WHERE deptName = 'Physics'
ORDER BY name DESC;

Where Clause 'Between' Predicate

In [16]:
-- What are the names of instructors with salary amounts 
-- between $90,000 and $100,000?
SELECT name
FROM instructor
WHERE salary BETWEEN 90000 AND 100000;

Where Clause 'Tuple Equality' Predicate

In [17]:
-- What are the instructor names and the courses they taught 
-- for all instructors in the Biology department who have 
-- taught some course?
SELECT name, courseId
FROM instructor, teaches
WHERE (instructor.instructorId, deptName) = (teaches.instructorId, 'Biology');

3.5 Set Operations

Union Operation

In [18]:
-- What are all the courses taught either in Fall 2009 or 
-- in Spring 2010, or both?
-- Disallow Duplicates
(SELECT courseId FROM section WHERE semester = 'Fall' AND year = 2009)
UNION
(SELECT courseId FROM section WHERE semester = 'Spring' AND year = 2010);
In [19]:
-- What are all the courses taught either in Fall 2009 or 
-- in Spring 2010, or both?
-- Allow Duplicates
(SELECT courseId FROM section WHERE semester = 'Fall' AND year = 2009)
UNION ALL
(SELECT courseId FROM section WHERE semester = 'Spring' AND year = 2010);

Intersect Operation

In [20]:
-- What are all the courses taught both in Fall 2009 and 
-- in Spring 2010?
-- Disallow Duplicates
(SELECT courseId FROM section WHERE semester = 'Fall' AND year = 2009)
INTERSECT
(SELECT courseId FROM section WHERE semester = 'Spring' AND year = 2010);
Out[20]:
CS-101

Except Operation

In [21]:
-- What are all the courses taught in Fall 2009 but not 
-- in Spring 2010?
-- Disallow Duplicates
(SELECT courseId FROM section WHERE semester = 'Fall' AND year = 2009)
EXCEPT
(SELECT courseId FROM section WHERE semester = 'Spring' AND year = 2010);

3.6 Null Values

In [22]:
SELECT NULL;

-- Addition, All NULL
-- SELECT NULL + NULL;
-- SELECT 1.0 + NULL;
-- SELECT NULL + 1.0;

-- Subtraction, All NULL
-- SELECT NULL - NULL;
-- SELECT 1.0 - NULL;
-- SELECT NULL - 1.0;

-- Multiplication, All NULL
-- SELECT NULL * NULL;
-- SELECT 1.0 * NULL;
-- SELECT NULL * 1.0;

-- Division, All NULL
-- SELECT NULL / NULL;
-- SELECT 1.0 / NULL;
-- SELECT NULL / 1.0;

-- And
SELECT NULL AND TRUE;
SELECT NULL AND FALSE;

-- Or
SELECT NULL OR TRUE;
SELECT NULL OR FALSE;

-- Not
SELECT NOT NULL;

-- Is
SELECT NULL IS NULL;
SELECT NULL IS NOT NULL;

3.7 Aggregate Functions

Basic Aggregation

In [23]:
-- Average
SELECT AVG(salary)
FROM instructor;

-- Minimum
SELECT MIN(salary)
FROM instructor;

-- Maximum
SELECT MAX(salary)
FROM instructor;

-- Sum
SELECT SUM(salary)
FROM instructor;

-- Count
SELECT COUNT(*)
FROM instructor;

Distinct Aggregation

In [24]:
-- What is the total number of instructors who teach a course 
-- in Spring 2010 semester?
SELECT COUNT(DISTINCT instructorId)
FROM teaches
WHERE (semester, year) = ('Spring', 2010);
Out[24]:
6

Aggregation with Grouping

  • The GROUP BY clause specifies attributes by which tuples with the same value on all specified attributes are placed in the same group.
In [25]:
-- What is the average salary in each department?
SELECT deptName, AVG(salary) AS avgSalary
FROM instructor
GROUP BY deptName;

Having Clause

  • The HAVING clause specifies conditions by which the query should be filtered after groups have been formed.
In [26]:
-- What is the average salary in each department 
-- if the average salary is greater than $42,000?
SELECT deptName, AVG(salary) AS avgSalary
FROM instructor
GROUP BY deptName
HAVING AVG(salary) > 42000;

Aggregation with Null Values

  • All aggregate functions except COUNT(*) ignore null values in their input collection.
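For example, takes contains one tuple with a null grade (inserted above), so COUNT(*) and COUNT(grade) disagree:

-- COUNT(*) counts every row; COUNT(grade) skips rows whose grade is null.
SELECT COUNT(*) AS numRows, COUNT(grade) AS numGrades
FROM takes;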

3.8 Nested Subqueries

  • A subquery is a select-from-where expression that is nested within another query.

Set Membership

  • The IN connective tests set membership.
  • The NOT IN connective tests the absence of set membership.
In [27]:
-- What are all the courses taught both in Fall 2009 and 
-- in Spring 2010?
SELECT DISTINCT courseId
FROM section
WHERE semester = 'Fall'
    AND year = 2009
    AND courseId IN (
        SELECT courseId
        FROM section
        WHERE semester = 'Spring'
            AND year = 2010
    );
Out[27]:
CS-101

Set Comparison

  • The SOME connective asserts a condition for at least one member of a set.
  • The ALL connective asserts a condition for all members of a set.
In [28]:
-- What are the names of all instructors whose salary 
-- is greater than at least one instructor in the Biology 
-- department?
SELECT name
FROM instructor
WHERE salary > SOME (
    SELECT salary
    FROM instructor
    WHERE deptName = 'Biology'
);

-- What are the names of all instructors whose salary 
-- is greater than all instructors in the Biology 
-- department?
SELECT name
FROM instructor
WHERE salary > ALL (
    SELECT salary
    FROM instructor
    WHERE deptName = 'Biology'
);

Existence Tests

  • The EXISTS connective tests whether a set is non-empty.
  • The NOT EXISTS connective tests whether a set is empty.
In [29]:
-- What are all the courses taught both in Fall 2009 and 
-- in Spring 2010?
SELECT courseId
FROM section AS S
WHERE semester = 'Fall'
    AND year = 2009
    AND EXISTS (
        SELECT *
        FROM section AS T
        WHERE semester = 'Spring'
            AND year = 2010
            AND S.courseId = T.courseId
    );
Out[29]:
CS-101

Uniqueness Tests

  • The UNIQUE connective asserts whether a set contains no duplicates.
  • The NOT UNIQUE connective asserts whether a set contains duplicates.
In [30]:
-- DOES NOT WORK WITH H2!
-- What are all the courses that were offered at most once 
-- in 2009?
-- SELECT T.courseId
-- FROM course AS T
-- WHERE UNIQUE (
--     SELECT R.courseId
--     FROM section AS R
--     WHERE T.courseId = R.courseId
--         AND R.year = 2009
-- );

With Clause

  • The WITH clause defines a temporary relation whose definition is available only to the query in which the clause occurs.
In [31]:
-- DOES NOT WORK WITH H2!
-- What department has the maximum budget?
-- WITH maxBudget(value) AS (
--     SELECT MAX(budget) FROM department
-- )
-- SELECT budget
-- FROM department, maxBudget
-- WHERE department.budget = maxBudget.value;

Scalar Subqueries

  • A scalar subquery is a subquery that returns only one tuple containing a single attribute.
  • A scalar subquery can be used wherever an expression returning a value is allowed.
In [32]:
-- What are all the departments along with the number of
-- instructors in each department?
SELECT deptName, (
    SELECT COUNT(*)
    FROM instructor
    WHERE department.deptName = instructor.deptName
) AS numInstructors
FROM department;

3.9 Modification of the Database

Deletion

  • The DELETE statement deletes all tuples in a relation for which a given predicate is true.
In [33]:
SET AUTOCOMMIT FALSE;

SELECT CONCAT('Setup: ', COUNT(*))
FROM instructor;

-- 
DELETE FROM instructor;
-- 
SELECT CONCAT('Test 1: ', COUNT(*)) 
FROM instructor;
-- 
ROLLBACK;

-- 
DELETE FROM instructor
WHERE deptName = 'Finance';
-- 
SELECT CONCAT('Test 2: ', COUNT(*))
FROM instructor
WHERE deptName = 'Finance';
-- 
ROLLBACK;

-- 
DELETE FROM instructor
WHERE salary BETWEEN 13000 AND 15000;
-- 
SELECT CONCAT('Test 3: ', COUNT(*)) 
FROM instructor
WHERE salary BETWEEN 13000 AND 15000;
-- 
ROLLBACK;

-- 
DELETE FROM instructor
WHERE deptName IN (
    SELECT deptName
    FROM department
    WHERE building = 'Watson'
);
-- 
SELECT CONCAT('Test 4: ', COUNT(*)) 
FROM instructor
WHERE deptName IN (
    SELECT deptName
    FROM department
    WHERE building = 'Watson'
);
-- 
ROLLBACK;

SELECT CONCAT('Teardown: ', COUNT(*))
FROM instructor;

SET AUTOCOMMIT TRUE;

Insertion

  • The INSERT statement inserts tuples into a relation.
In [34]:
SET AUTOCOMMIT FALSE;

SELECT CONCAT('Setup: ', COUNT(*))
FROM course;

-- 
INSERT INTO course
VALUES ('CS-437', 'Database Systems', 'Comp. Sci.', 4);
-- 
SELECT CONCAT('Test 1: ', COUNT(*)) 
FROM course;
-- 
ROLLBACK;

-- 
INSERT INTO course (courseId, title, deptName, credits)
VALUES ('CS-437', 'Database Systems', 'Comp. Sci.', 4);
-- 
SELECT CONCAT('Test 2: ', COUNT(*)) 
FROM course;
-- 
ROLLBACK;

SELECT CONCAT('Teardown: ', COUNT(*))
FROM course;

SET AUTOCOMMIT TRUE;

Update

  • The UPDATE statement updates tuples of a relation.
In [35]:
SET AUTOCOMMIT FALSE;

SELECT CONCAT('Setup: ', COUNT(*))
FROM instructor;

-- 
UPDATE instructor
SET salary = salary * 1.05;
-- 
ROLLBACK;

-- 
UPDATE instructor
SET salary = salary * 1.05
WHERE salary < 70000;
-- 
ROLLBACK;

SELECT CONCAT('Teardown: ', COUNT(*))
FROM instructor;

SET AUTOCOMMIT TRUE;

Chapter 4: Intermediate SQL (TOC)

4.1 Join Expressions

In [36]:
SELECT *
FROM student NATURAL JOIN takes;

Join Conditions

  • The JOIN ... USING clause specifies the required attributes to match for the join.
  • The JOIN ... ON <condition> clause specifies the required condition to satisfy for the join.

Inner Joins

  • The INNER JOIN does not preserve nonmatched tuples.
In [37]:
-- DOES NOT WORK WITH H2!
-- SELECT * 
-- FROM student INNER JOIN takes USING (studentId);

SELECT student.name, takes.courseId
FROM student INNER JOIN takes
    ON student.studentId = takes.studentId;

Outer Joins

  • The LEFT OUTER JOIN preserves tuples only in the relation left of the left outer join operation.
  • The RIGHT OUTER JOIN preserves tuples only in the relation right of the right outer join operation.
  • The FULL OUTER JOIN preserves tuples in both relations.
In [38]:
SELECT student.name, takes.courseId
FROM student LEFT OUTER JOIN takes
    ON student.studentId = takes.studentId;
    
SELECT student.name, takes.courseId
FROM student RIGHT OUTER JOIN takes
    ON student.studentId = takes.studentId;
    
-- DOES NOT WORK WITH H2!
-- SELECT student.name, takes.courseId
-- FROM student FULL OUTER JOIN takes
--     ON student.studentId = takes.studentId;

Chapter 6: Formal Relational Query Languages (TOC)

6.1 The Relational Algebra

Assignment Operation

  • Notation: $tempVariable \gets expression$
  • Definition: Assigns a relational-algebra expression into a temporary relation variable.

Select Operation

  • Notation: $\sigma_{predicate}(r)$
    • And: $\wedge$
    • Or: $\vee$
    • Not: $\neg$
  • Definition: Return rows of the input relation that satisfy the predicate. $$\sigma_{p}(r) = \left\{ t \mid t \in r \wedge p(t) \right\}$$
  • Example: What are all the instructors who are in the Physics department? $$\sigma_{deptName = 'Physics'}(instructor)$$

Project Operation

  • Notation: $\prod_{attribute_1, attribute_2, ..., attribute_k}(r)$
  • Definition: Output specified attributes from all rows of the input relation. Remove duplicate tuples from the output.
  • Example: What are the IDs, names, and salaries of all instructors? $$\prod_{instructorId, name, salary}(instructor)$$

Union Operation

  • Notation: $r \cup s$
  • Definition: Output the union of tuples from the two input relations. $$r \cup s = \left\{ t \mid t \in r \vee t \in s \right\}$$
  • Example: What are all the courses that are taught in Fall 2009 or in Spring 2010, or in both? $$\prod_{courseId}\left( \sigma_{semester = 'Fall' \wedge year = 2009}(section) \right) \cup \prod_{courseId}\left( \sigma_{semester = 'Spring' \wedge year = 2010}(section) \right)$$

Set Difference Operation

  • Notation: $r - s$
  • Definition: Output the tuples that are contained in the first relation but not in the second relation. $$r - s = \left\{ t \mid t \in r \wedge t \notin s \right\}$$
  • Example: What are all the courses that are taught in Fall 2009 but not in Spring 2010? $$\prod_{courseId}\left( \sigma_{semester = 'Fall' \wedge year = 2009}(section) \right) - \prod_{courseId}\left( \sigma_{semester = 'Spring' \wedge year = 2010}(section) \right)$$

Cartesian Product Operation

  • Notation: $r \times s$
  • Definition: Output all pairs of rows from the two input relations. $$r \times s = \left\{ tq \mid t \in r \wedge q \in s \right\}$$
  • Example: What are all the courses that all the instructors who are in the Physics department teach? $$I \gets instructor$$ $$T \gets teaches$$ $$\prod_{name, courseId}\left( \sigma_{I.instructorId = T.instructorId}\left( \sigma_{deptName = 'Physics'}(I \times T) \right) \right)$$

Rename Operation

  • Notation: $\rho_{relation(attribute_1, attribute_2, ..., attribute_k)}(r)$
  • Definition: Output the relation with a new relation name and a set of new attribute names.
  • Example: What is the largest salary in the university? $$\prod_{salary}(instructor) - \prod_{instructor.salary}\left( \sigma_{instructor.salary < d.salary}\left( instructor \times \rho_d(instructor) \right) \right)$$

Set Intersection Operation

  • Notation: $r \cap s$
  • Definition: Output the tuples that are contained in the first relation and in the second relation. $$r \cap s = r - (r - s)$$
  • Example: What are all the courses that are taught in Fall 2009 and in Spring 2010? $$\prod_{courseId}\left( \sigma_{semester = 'Fall' \wedge year = 2009}(section) \right) \cap \prod_{courseId}\left( \sigma_{semester = 'Spring' \wedge year = 2010}(section) \right)$$

Set Division Operation

  • Requirement: Every attribute of schema $S$ is in schema $R$.
  • Notation: $r \div s$
  • Definition: Outputs the largest relation $t(R - S)$ such that $t \times s \subseteq r$. $$r \div s = \prod_{R - S}(r) - \prod_{R - S}\left( \left( \prod_{R - S}(r) \times s \right) - r \right)$$
    • Relation $r \div s$ is a relation on schema $R - S$.
    • A tuple $t$ is in $r \div s$ if and only if both conditions hold:
      1. $t$ is in $\prod_{R - S}(r)$
      2. For every tuple $t_s$ in $s$, there is a tuple $t_r$ in $r$ satisfying both of the following:
        • $t_r[S] = t_s[S]$
        • $t_r[R - S] = t$
  • Example: What are all the students who have taken all courses in the Biology department? $$\prod_{studentId, courseId}(takes) \div \prod_{courseId}\left( \sigma_{deptName = 'Biology'}(course) \right)$$
Advice
  1. The set division operation is suited to queries that include the phrase "for all".
  2. The set division operation is analogous to the following:
    1. Key $r$ by $R - S$ for groups of $t_r[R - S] \rightarrow g$.
    2. Output $t_r[R - S]$ for each $t_r[R - S] \rightarrow g$ where $s \subseteq g$.
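SQL has no division operator, but the example above can be sketched over the university schema with double negation (a minimal sketch: students for whom no Biology course exists that they have not taken):

-- Division via double negation: keep a student if there is no
-- Biology course missing from that student's takes tuples.
SELECT DISTINCT studentId
FROM student AS s
WHERE NOT EXISTS (
    SELECT courseId
    FROM course
    WHERE deptName = 'Biology'
        AND courseId NOT IN (
            SELECT courseId
            FROM takes
            WHERE takes.studentId = s.studentId
        )
);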

Natural Join Operation

  • Notation: $r \Join s$
    • Theta Join: $\Join_\theta$
    • Left Outer Join: $⟕$
    • Right Outer Join: $⟖$
    • Full Outer Join: $⟗$
  • Definition: Output all pairs of rows from the two input relations that have the same value on each of the attributes in $R \cap S$. For example, with $R = (A, B, C, D)$ and $S = (B, D, E)$: $$r \Join s = \prod_{r.A, r.B, r.C, r.D, s.E}\left( \sigma_{r.B = s.B \wedge r.D = s.D}(r \times s) \right)$$
  • Example: What are all the names of all instructors in the Physics department and all the courses that they teach? $$\prod_{name, title}\left( \sigma_{deptName = 'Physics'}(instructor \Join teaches \Join course) \right)$$

Chapter 7: Database Design and the E-R Model (TOC)

7.2 The Entity-Relationship Model

  • The entity-relationship (E-R) data model provides a convenient graphical representation to view data, relationships, and constraints.

Entity Sets

  • An entity is an object that exists in the real world and is distinguishable from other objects.
    • Each entity is associated with a set of attributes that describes the object.
  • An entity set is a collection of entities of the same type.


Relationship Sets

  • A relationship is an association among several entities.
  • A relationship set is a collection of relationships of the same type.

Relationship Set

  • A relationship may have descriptive attributes that describe how entities are related.

Relationship Set with Descriptive Attribute

Attributes

  • Domain: Set of permitted values of an attribute.
  • Simple vs. Composite: Attributes are either atomic or composed of other component attributes.
  • Single-Valued vs. Multi-Valued: Attributes have either a single value or multiple values for a particular entity.
    • Single-Valued: Unique ID per Instructor
    • Multi-Valued: 1 or More Departments per Instructor

7.3 Constraints

Mapping Cardinalities

  • Mapping Cardinalities: The number of entities to which another entity can be associated via a relationship set.
  • Binary Relationships:
    • One-to-One
    • One-to-Many
    • Many-to-One
    • Many-to-Many

One-to-X Mapping

Many-to-X Mapping

Keys

  • Let $R$ be a relationship set involving entity sets $E_1, E_2, ..., E_n$.
  • Let $\text{PrimaryKey}(E_i)$ denote the set of attributes that form the primary key for entity set $E_i$.
    • Use Dotted Notation for Attribute Name Collisions
  • Superkey for Relationship Set: $$\text{PrimaryKey}(E_1) \cup \text{PrimaryKey}(E_2) \cup ... \cup \text{PrimaryKey}(E_n)$$

Binary Relationships between $A$ and $B$

  • One-to-One: $\text{PrimaryKey}(R) = \text{PrimaryKey}(A)$ or $\text{PrimaryKey}(R) = \text{PrimaryKey}(B)$
  • One-to-Many: $\text{PrimaryKey}(R) = \text{PrimaryKey}(B)$
  • Many-to-One: $\text{PrimaryKey}(R) = \text{PrimaryKey}(A)$
  • Many-to-Many: $\text{PrimaryKey}(R) = \text{PrimaryKey}(A) \cup \text{PrimaryKey}(B)$

7.5 Entity-Relationship Diagrams

Basic Structure

  • Rectangles: Entity Sets
    • Underlined Attributes: Primary Key
  • Diamonds: Relationship Sets
  • Undivided Rectangles: Descriptive Attributes of Relationship Set
  • Lines: Links Entity Sets $\rightarrow$ Relationship Sets
  • Dashed Lines: Links Attributes of Relationship Set $\rightarrow$ Relationship Set
  • Double Lines: Participation of Entity $\rightarrow$ Relationship Set
    • Total Participation: Every entity in the entity set participates in at least one relationship in the relationship set.
  • Double Diamonds: Relationship with Weak Entity Sets

E-R Diagram

Mapping Cardinality

E-R Diagram Relationships

E-R Diagram Cardinality Limits

Complex Attributes

E-R Diagram Complex Attributes

Role Indicator in Relationship

E-R Diagram Role Indicator

Weak Entity Sets

  • Weak Entity Set: An entity set that does not have sufficient attributes to form a primary key.
  • Identifying Entity Set: The entity set with which a weak entity set must be associated.
  • Discriminator: Set of attributes that uniquely distinguishes all the entities in the weak entity set.

E-R Diagram with Weak Entity Set

  • The discriminator of a weak entity is underlined with a dashed line.
  • The relationship set connecting the weak entity set to the identifying strong entity set is depicted by a double diamond.

7.6 Reduction to Relational Schemas

Representation of Strong Entity Sets with Simple Attributes

  • A strong entity set with simple attributes maps directly to a relational schema. $$student(\underline{ID}, name, tot\_cred)$$

Representation of Strong Entity Sets with Complex Attributes

  • A strong entity set with complex attributes maps directly to a relational schema with complex attributes replaced with their component attributes. $$instructor(\underline{ID}, first\_name, middle\_initial, last\_name, street\_number, street\_name, apt\_number, city, state, zip\_code, date\_of\_birth)$$
Multi-Valued Attributes
  • A multi-valued attribute $M$ of an entity $E$ is represented by a separate relational schema $EM$ whose primary key is the union of the primary key of $E$ and the multi-valued attribute $M$. $$inst\_phone(\underline{ID}, \underline{phone\_number})$$

Representation of Weak Entity Sets

  • A weak entity set maps directly to a relational schema with the primary key of the identifying entity set.
  • The primary key of the relational schema is the union of the primary key of the identifying entity set and the discriminator of the weak entity set. $$section(\underline{course\_id}, \underline{sec\_id}, \underline{sem}, \underline{year})$$

Representation of Relationship Sets

  • Create a relational schema for every relationship set with attributes for the primary keys of the participating entity sets and all the descriptive attributes. $$advisor(\underline{s\_ID}, \underline{i\_ID})$$
    • One-to-One: Choose one of the participating entity sets and include only the primary keys of that entity set.
    • One-to-Many / Many-to-One: Include only the attributes for the primary keys of the "many" side.
    • Many-to-Many: Include the attributes for the primary keys of both participating entity sets.

Redundancy of Schemas

  • In many-to-one or one-to-many relationships, if the "many" side is in total participation, the relationship can be represented by adding an extra attribute to the "many" side containing the primary key of the "one" side.
  • The schema for the relationship set linking a weak entity set to its corresponding strong entity set is redundant and does not need to be present in a relational database design based upon an E-R diagram.
    • No relational schema is required for $sec\_course$.

Example of Redundancy of Schemas

  • A relational schema for $inst\_dept$ is redundant. Instead $dept\_name$ should be added to the schema of $instructor$.

7.7 Entity-Relationship Design Issues

  1. Use of Attributes vs. Entity Sets
  2. Use of Entity Sets vs. Relationship Sets
  3. Use of Ternary Relationship vs. Pair of Binary Relationships
  4. Use of Weak Entity Set vs. Entity Set
  5. Use of Specialization/Generalization for Modularity
  6. Use of Aggregation for Abstraction

7.8 Extended E-R Features

Specialization and Generalization

  • Specialization and generalization define a containment relationship between a higher-level entity set and one or more lower-level entity sets.
  • Specialization is the result of taking a subset of a higher-level entity set to form a lower-level entity set.
  • Generalization is the result of taking the union of two or more disjoint (lower-level) entity sets to produce a higher-level entity set.
  • The attributes of higher-level entity sets are inherited by lower-level entity sets.

Specialization and Generalization

  • Ordinary Generalization: Separate Arrows from Lower to Higher.
    • Example: A Person can be both an Employee and a Student.
  • Disjoint Generalization: Combined Arrows from Lower to Higher.
    • Example: An Employee cannot be both an Instructor and a Secretary.

Aggregation

  • Aggregation: An abstraction in which relationship sets (along with their associated entity sets) are treated as higher-level entity sets, and can participate in relationships.

Chapter 8: Relational Database Design (TOC)

8.1 Features of Good Relational Designs

Bad Relational Database Design

Repetition of Information
  • A condition where the values of one attribute are determined by the values of another attribute in the same relation, and both values are repeated throughout the relation.
  • Problem: Increases the storage required for the relation.
  • Problem: Makes updating the relation more difficult.
Inability to Represent Information
  • A condition where a relationship exists among only a proper subset of the attributes in a relation.
  • Problem: All the unrelated attributes must be filled with null values; otherwise, a tuple without the unrelated information cannot be inserted into the relation.
Loss of Information
  • A condition that results from decomposing one relation into two relations that cannot be combined to recreate the original relation.
  • Problem: Certain queries cannot be answered using the reconstructed relation that could have been answered using the original relation.

Lossless-Join Decomposition

  • A decomposition of relational schema $R$ into relational schemas $R_1$ and $R_2$ such that for every instance $r(R)$ corresponding with instances $r_1(R_1)$ and $r_2(R_2)$, $r = r_1 \Join r_2$ holds.
  • A decomposition $\{ R_1, R_2 \}$ is a lossless-join decomposition if $R_1 \cap R_2 \rightarrow R_1$ or $R_1 \cap R_2 \rightarrow R_2$.
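  • Example: Decomposing $inst\_dept(ID, name, salary, dept\_name, building, budget)$ into $instructor(ID, name, dept\_name, salary)$ and $department(dept\_name, building, budget)$ is lossless, assuming $dept\_name \rightarrow building\ budget$ holds: $R_1 \cap R_2 = \{ dept\_name \}$ and $dept\_name \rightarrow R_2$.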

8.2 Atomic Domains and First Normal Form

  • A domain is atomic if elements of the domain are considered to be indivisible units.
  • A relation schema $R$ is in first normal form (1NF) if the domains of all attributes of $R$ are atomic.

8.3 Decomposition Using Functional Dependencies

  • A legal instance is an instance of a relation that satisfies all the real-world constraints imposed on it.
  • A superkey $K$ of $r(R)$ is a subset of $R$ such that, in any legal instance of $r(R)$, for all pairs $t_1$ and $t_2$ of tuples in the instance, if $t_1 \neq t_2$, then $t_1[K] \neq t_2[K]$.

Definition of Functional Dependency

  • A functional dependency expresses a constraint that the values of one set of attributes determine the values of another set. $$\alpha \rightarrow \beta$$
  • Given an instance of $r(R)$, the instance satisfies the functional dependency $\alpha \rightarrow \beta$ if for all pairs of tuples $t_1$ and $t_2$ in the instance such that $t_1[\alpha] = t_2[\alpha]$, it is also the case that $t_1[\beta] = t_2[\beta]$.
  • The functional dependency $\alpha \rightarrow \beta$ holds on schema $r(R)$ if every legal instance of $r(R)$ satisfies it.
  • A functional dependency is trivial if it is satisfied by all instances of a relation.
    • If $\beta \subseteq \alpha$, then $\alpha \rightarrow \beta$ is trivial.

Uses of Functional Dependencies

  1. Test relations to see if they are legal under a given set of functional dependencies.
  2. Specify constraints on the set of legal relations.

Closure of a Set of Functional Dependencies

  • Given a relational schema $r(R)$, a functional dependency $f$ on $R$ is logically implied by a set of functional dependencies $F$ on $r$ if every instance of $r(R)$ that satisfies $F$ also satisfies $f$.
  • The closure of $F$, denoted $F^+$, is the set of all functional dependencies logically implied by $F$.

Dependency Preservation

  • A decomposition is dependency preserving if and only if $(F_1 \cup F_2 \cup ... \cup F_n)^+ = F^+$.

Boyce-Codd Normal Form

  • A relation schema $R$ is in Boyce-Codd normal form (BCNF) with respect to a set $F$ of functional dependencies if for every dependency $\alpha \rightarrow \beta$ in $F^+$ such that $\alpha, \beta \subseteq R$, at least one of the following holds:
    • $\alpha \rightarrow \beta$ is trivial.
    • $\alpha$ is a superkey for $R$.
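  • Example: $inst\_dept(ID, name, salary, dept\_name, building, budget)$ with $dept\_name \rightarrow budget$ is not in BCNF, because the dependency is nontrivial and $dept\_name$ is not a superkey; decomposing into $instructor$ and $department$ removes the violation.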

Third Normal Form

  • A relation schema $R$ is in third normal form (3NF) with respect to a set $F$ of functional dependencies if for every dependency $\alpha \rightarrow \beta$ in $F^+$ such that $\alpha, \beta \subseteq R$, at least one of the following holds:
    • $\alpha \rightarrow \beta$ is trivial.
    • $\alpha$ is a superkey for $R$.
    • Each attribute $B$ in $\beta - \alpha$ is contained in a candidate key for $R$.

Second Normal Form

  • A non-prime attribute of a relation schema $R$ is an attribute that is not a part of any candidate key of $R$.
  • A relation schema $R$ is in second normal form (2NF) with respect to a set $F$ of functional dependencies if:
    • $R$ is in 1NF.
    • Every non-prime attribute of $R$ depends on the whole of every candidate key, not on any proper subset of one.
  • If every candidate key in $R$ has only one attribute, then $R$ is automatically in 2NF.

8.4 Functional-Dependency Theory

Armstrong's Axioms

  • Reflexivity: If $\beta \subseteq \alpha$, then $\alpha \rightarrow \beta$.
  • Augmentation: If $\alpha \rightarrow \beta$, then $\gamma\alpha \rightarrow \gamma\beta$.
  • Transitivity: If $\alpha \rightarrow \beta$ and $\beta \rightarrow \gamma$, then $\alpha \rightarrow \gamma$.

Additional Inference Rules

  • Union: If $\alpha \rightarrow \beta$ and $\alpha \rightarrow \gamma$, then $\alpha \rightarrow \beta\gamma$.
  • Decomposition: If $\alpha \rightarrow \beta\gamma$, then $\alpha \rightarrow \beta$ and $\alpha \rightarrow \gamma$.
  • Pseudotransitivity: If $\alpha \rightarrow \beta$ and $\gamma\beta \rightarrow \delta$, then $\gamma\alpha \rightarrow \delta$.
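  • Example: The union rule follows from Armstrong's axioms: augmenting $\alpha \rightarrow \beta$ with $\alpha$ gives $\alpha \rightarrow \alpha\beta$; augmenting $\alpha \rightarrow \gamma$ with $\beta$ gives $\alpha\beta \rightarrow \beta\gamma$; transitivity then gives $\alpha \rightarrow \beta\gamma$.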

Computing $F^+$


Closure of Attribute Sets

  • Given a set of attributes $\alpha$, the closure of $\alpha$ under $F$, denoted $\alpha^+$, is the set of attributes that are functionally determined by $\alpha$ under $F$.

Uses of Attribute Set Closure

  1. Test whether $\alpha$ is a superkey of $R$ by computing $\alpha^+$ and checking that $\alpha^+$ contains all attributes of $R$.
  2. Test whether a functional dependency $\alpha \rightarrow \beta$ holds on $R$ by computing $\alpha^+$ and checking that $\beta \subseteq \alpha^+$.
  3. Computing $F^+$:
    1. For each $\gamma \subseteq R$,
      1. Find the closure $\gamma^+$.
      2. For each $S \subseteq \gamma^+$, output the functional dependency $\gamma \rightarrow S$.
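  • Example: Let $R = (A, B, C, G, H, I)$ and $F = \{ A \rightarrow B,\ A \rightarrow C,\ CG \rightarrow H,\ CG \rightarrow I,\ B \rightarrow H \}$. Computing $(AG)^+$: start with $\{ A, G \}$; $A \rightarrow B$ and $A \rightarrow C$ add $B$ and $C$; $CG \rightarrow H$ and $CG \rightarrow I$ add $H$ and $I$. Thus $(AG)^+ = ABCGHI = R$, so $AG$ is a superkey of $R$.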

Computing $\alpha^+$


Canonical Cover

  • A canonical cover of $F$ is a minimal set of functional dependencies equivalent to $F$, having no redundant dependencies.

Extraneous Attributes

  • Given a set $F$ of functional dependencies and a functional dependency $\alpha \rightarrow \beta$ in $F$,
  • Attribute $A$ is extraneous in $\alpha$ if $A \in \alpha$ and $F$ logically implies $\left( F - \{ \alpha \rightarrow \beta \} \right) \cup \left\{ (\alpha - A) \rightarrow \beta \right\}$.
    • Compute $(\alpha - A)^+$ using the functional dependencies in $F$.
    • $A$ is extraneous in $\alpha$ if and only if $(\alpha - A)^+$ contains $\beta$.
  • Attribute $B$ is extraneous in $\beta$ if $B \in \beta$ and $\left( F - \{ \alpha \rightarrow \beta \} \right) \cup \left\{ \alpha \rightarrow (\beta - B) \right\}$ logically implies $F$.
    • Compute $\alpha^+$ using the functional dependencies in $\left( F - \{ \alpha \rightarrow \beta \} \right) \cup \left\{ \alpha \rightarrow (\beta - B) \right\}$.
    • $B$ is extraneous in $\beta$ if and only if $\alpha^+$ contains $B$.
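  • Example: Given $F = \{ A \rightarrow C,\ AB \rightarrow C \}$, attribute $B$ is extraneous in $AB \rightarrow C$: computing $(AB - B)^+ = A^+$ under $F$ gives $\{ A, C \}$, which contains $\beta = \{ C \}$.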

Computing $F^C$


8.5 Algorithms for Decomposition

General Test for BCNF

  1. Given a relation schema $R$ and a set of functional dependencies $F$,
  2. Compute attribute closures for all subsets of attributes of $R$ with respect to $F$.
  3. Examine the attribute closures and look for a dependency $\alpha \rightarrow \beta$ that violates BCNF.
  4. If no such dependency exists, then conclude that $R$ is in BCNF.

Alternative Test for BCNF

  1. If $F$ is a set of functional dependencies over ONLY the attributes of $R$,
  2. Examine each functional dependency $\alpha \rightarrow \beta$ and check whether it violates BCNF.

BCNF Decomposition


General Test for 3NF

  1. Given a relation schema $R$ and a set of functional dependencies $F$,
  2. Compute attribute closures for all subsets of attributes of $R$ with respect to $F$.
  3. Identify all candidate keys.
  4. Examine the attribute closures and look for a dependency $\alpha \rightarrow \beta$ that violates 3NF.
  5. If no such dependency exists, then conclude that $R$ is in 3NF.

Alternative Test for 3NF

  1. If $F$ is a set of functional dependencies over ONLY the attributes of $R$,
  2. Identify all candidate keys.
  3. Examine each functional dependency $\alpha \rightarrow \beta$ and check whether it violates 3NF.

3NF Decomposition


Chapter 10: Storage and File Structure (TOC)

10.1 Overview of Physical Storage Media

Storage Device Hierarchy

  1. Cache
  2. Main Memory
  3. Flash Memory
  4. Magnetic-Disk Storage
  5. Optical Storage
  6. Tape Storage

Storage Types

  • Primary Storage: Cache, Main Memory
  • Secondary Storage (Online Storage): Flash Memory, Magnetic-Disk Storage
  • Tertiary Storage (Offline Storage): Optical Storage, Tape Storage
  • Volatile Storage: Loses Data without Power
  • Nonvolatile Storage: Preserves Data without Power

10.2 Magnetic Disk and Flash Storage

Magnetic Disk


Performance Measures of Disks

  • Access Time: Time from when a read or write request is issued to when data transfer begins.
  • Seek Time: Time for repositioning the arm.
  • Rotational Latency Time: Time spent waiting for the sector to be accessed to appear under the head.
  • Average Latency Time: One-half the time for a full rotation of the disk.
  • Data-Transfer Rate: Rate at which data can be retrieved from or stored to the disk.
  • Mean Time to Failure (MTTF): Average time expected of the system to run continuously without any failure.
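  • Example (assuming a 7200 RPM disk, a common spindle speed): one full rotation takes $60 / 7200 \text{ s} \approx 8.33$ ms, so the average latency time is about $4.17$ ms.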

Optimization of Disk-Block Access

  • Block: A logical unit consisting of a fixed number of contiguous sectors, typically a few kilobytes.
  • Sequential Access: Successive Requests for Successive Blocks.
    • $1 \times \text{Seek Time} + N \times \text{Transfer Time}$
  • Random Access: Successive Requests for Random Blocks.
    • $N \times \text{Seek Time} + N \times \text{Transfer Time}$
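  • Example (assumed figures): with a 4 ms seek time and a 0.1 ms transfer time per block, reading $N = 100$ blocks costs about $4 + 100 \times 0.1 = 14$ ms sequentially, but $100 \times (4 + 0.1) = 410$ ms randomly.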
Techniques
  • Buffering
  • Read-Ahead
  • Disk-Arm-Scheduling Algorithms (Elevator Algorithm)
  • File Organization & Fragmentation
  • Nonvolatile Write Buffers (NVRAM)
  • Log Disk (Journaling File Systems)

10.3 RAID

  • RAID: Redundant Arrays of Independent Disks

Improvement of Reliability via Redundancy

  • Mirroring: Duplicate Disks

Improvement in Performance via Parallelism

  • Data Striping: Write Data Across Multiple Disks
  • Bit-Level Striping: Bit $i$ to Disk $i$
  • Block-Level Striping: Block $i$ to Disk $(i \mod n) + 1$

RAID Levels

  • RAID Level 5
    • Small Write Performance: $\frac{N}{N - 1} \times$
    • Large Write Performance: $(N - 1) \times$
    • Small Read Performance: $N \times$
    • Large Read Performance: $(N - 1) \times$
  • RAID Level 10: RAID Level 1 + RAID Level 0
    • Write Performance: $\frac{N}{2} \times$
    • Read Performance: $N \times$


10.5 File Organization

  • File: Sequence of Records
  • Block: $\text{File} \rightarrow \text{Fixed-Length Blocks}$

Fixed-Length Records

  • Record $i$ starts at byte $n \times (i - 1)$, where $n$ is the record size.
  • File Header: Stores Address of First Deleted Record
  • Free List: Deleted Record $i$ $\rightarrow$ Deleted Record $(i + 1)$

Variable-Length Records

  • Record = < Fixed Length Attributes, Null Bitmap, Variable Length Attributes >
    • Variable Length Attributes: Fixed Size (Offset, Length)
  • Slotted-Page Structure: Organizes Variable-Length Records via Header:
    1. Number of Record Entries
    2. End of Free Space in Block
    3. Location & Size of Each Record

10.6 Organization of Records in Files

  • Heap File Organization: Any record can be placed anywhere in the file where there is space for the record. There is no ordering of records.
  • Sequential File Organization: Records are stored in sequential order, according to the value of a "search key" of each record.
  • Hashing File Organization: A hash function is computed on some attribute of each record. The result of the hash function specifies in which block of the file the record should be placed.
  • Multitable Clustering File Organization: Records of several different relations are stored in the same file; further, related records of the different relations are stored on the same block, so that one I/O operation fetches related records from all the relations.

Sequential File Organization

  • Search Key: Any set of attributes by which a sequential file is sorted.
  • Insertion:
    1. Locate the position where the record is to be inserted.
      1. If free space, insert.
      2. If no free space, insert into an overflow block.
    2. Update the pointer chain.
  • Deletion: Pointer Chains

10.7 Data-Dictionary Storage

  • The data dictionary (system catalog) keeps track of metadata, that is data about data, such as relation names, attribute names and types, storage information, integrity constraints, and user information.

10.8 Database Buffer

  • Buffer: A part of main memory available for storage of copies of disk blocks.
  • Buffer Manager: Subsystem responsible for the allocation of buffer space.

Buffer Manager

  • Buffer Replacement Strategy:
    • Least Recently Used (LRU) for Sorting
    • Most Recently Used (MRU) for Nested-Loop Joins
  • Pinned Blocks: No Write-Back During Updates
  • Forced Output of Block: Write-Back for Resiliency

Chapter 11: Indexing and Hashing (TOC)

11.1 Basic Concepts

  • Ordered Indices: Based on a sorted ordering of the values.
  • Hash Indices: Based on a uniform distribution of values across a range of buckets to which values are assigned by a hash function.
  • Range Query: Find records with a range of attribute values.
  • Point Query: Find records with a specific attribute value.

11.2 Ordered Indices

  • Primary Index (Clustering Index): An index on a search key where the sort order of the search key matches the sort order of a relation.
  • Secondary Index (Nonclustering Index): An index on a search key whose sort order differs from the sort order of the relation.
    • Advantage: Improves Performance.
    • Disadvantage: Expensive Sequential Scan + Maintenance Overhead.
  • Index-Sequential File: A sequential file with a clustering index on the search key.
    • Disadvantage: Performance Degradation $\propto$ File Growth.

Dense and Sparse Indices

  • Dense Indices: Contain entries for every search-key value.
    • Advantage: Faster Lookups Than Sparse Indices
    • Secondary Indices Always Dense
  • Sparse Indices: Contain entries only for some search-key values.
    • Advantage: Less Maintenance Overhead

11.3 $B^+$-Tree Index Files

  • A $B^+$-tree index takes the form of a balanced tree, in which every path from the root of the tree to a leaf of the tree is of the same length.
  • The height of a $B^+$-tree is proportional to the logarithm to the base $N$ of the number of records in the relation, where each nonleaf node stores $N$ pointers.
    • The value of $N$ is often around 50 or 100.
  • $B^+$-trees are much shorter than other balanced binary-tree structures such as AVL trees, and therefore require fewer disk accesses to locate records.
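  • Example: with $N = 100$ pointers per node, a relation of $10^6$ records has height about $\log_{100} 10^6 = 3$, so a lookup touches only 3 or 4 nodes, whereas a balanced binary tree needs about $\log_2 10^6 \approx 20$.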

Example of $B^+$-Tree

$B^+$-Tree File Organization

  • Besides indexing a file of records, a $B^+$-tree can itself organize the records into a file.
  • Advantage: Cheaper Point Queries
  • Disadvantage: Pricier Table Scans

11.5 Multiple-Key Access

Indices on Multiple Keys

  • Composite Search Keys use Lexicographic Ordering: $$(a_1, a_2) < (b_1, b_2) \iff a_1 < b_1 \vee (a_1 = b_1 \wedge a_2 < b_2)$$
  • Efficient Conditions: $a_1 = v_1 \wedge a_2 = v_2$, $a_1 = v_1 \wedge a_2 < v_2$ (for constants $v_1$, $v_2$)
  • Inefficient Condition: $a_1 < v_1 \wedge a_2 = v_2$
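
Python tuples compare lexicographically, which makes the asymmetry easy to see; the sketch below (illustrative only) shows why fixing a prefix of the key keeps matches contiguous in the sorted order while fixing only a later attribute does not.

```python
# Python tuples compare lexicographically, like composite search keys.
keys = [(2, "b"), (1, "z"), (2, "a"), (1, "a")]
keys.sort()
assert keys == [(1, "a"), (1, "z"), (2, "a"), (2, "b")]

# Efficient: all matches for a1 = 2 are contiguous in the sorted order,
# so an ordered index can find them with one traversal plus a short scan.
print([k for k in keys if k[0] == 2])    # [(2, 'a'), (2, 'b')]

# Inefficient: matches for a2 = 'a' are scattered, forcing a full scan.
print([k for k in keys if k[1] == "a"])  # [(1, 'a'), (2, 'a')]
```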

11.6 Static Hashing

  • Hash File Organization: Obtain the address of the disk block containing a desired record directly by computing a function on the search-key value of the record.
  • Hash Index Organization: Organize the search keys, with their associated pointers, into a hash file structure.
  • Since, at design time, it is unknown precisely which search-key values will be stored in the file, a good hash function is one that assigns search-key values to buckets such that the distribution is both uniform and random.
  • Static Hashing: Uses hash functions in which the set of bucket addresses is fixed.
    • Disadvantage: Cannot Accommodate Data Growth

Handling of Bucket Overflows

  • Bucket Overflow $\leftarrow$ Insufficient Buckets + Skew
  • Closed Hashing: Handle Bucket Overflows with Overflow Chaining
    • Overflow Chaining: A linked list of all the overflow buckets of a given bucket.
  • Open Hashing: Fixed Number of Buckets but Linear/Quadratic Probing, Double Hashing, Cuckoo Hashing for Reassignment

11.7 Dynamic Hashing

  • Dynamic Hashing: Allow hash functions to be modified dynamically to accommodate the growth or shrinkage of the database.

Extendable Hashing

  • Choose a uniform and random hash function $h$ that generates relatively large values (e.g., $b = 32$ bits).
  • Use the $i$ high-order bits of the hash value ($0 \leq i \leq b$) as the offset into a bucket address table.
  • Grow or shrink $i$ to split or coalesce buckets dynamically.
  • Allow multiple entries in the bucket address table to point to the same bucket.

Example of Extendable Hashing

Lookups in Extendable Hashing

  1. Compute $h(K_j) = X$.
  2. Use the $i$ most significant bits of $X$ as an offset into the bucket address table to look up the bucket.

Insertions in Extendable Hashing

  1. If bucket $j$ has free space, insert the record.
  2. Else,
    • If more than one entry in the bucket address table points to bucket $j$,
      1. Split bucket $j$ into 2 buckets.
      2. Divide the records and the table entries between the 2 buckets.
      3. Retry the insertion.
    • If exactly one entry points to bucket $j$,
      • If too many splits have already occurred (e.g., many records share one hash value),
        1. Create an overflow bucket.
      • Else,
        1. Increment $i$.
        2. Double the size of the bucket address table.
        3. Replace each entry in the bucket address table with 2 entries that point to the same bucket.
        4. Split bucket $j$ and retry the insertion, as above.

Deletions in Extendable Hashing

  1. Remove record from bucket $j$.
  2. If bucket $j$ is empty, remove it from the bucket address table.
  3. If possible, coalesce a bucket with a neighbouring bucket that has the same hash prefix.
  4. Rarely, decrease the size of the bucket address table.
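
A minimal in-memory sketch of these operations, assuming small integer keys (for which CPython's hash is the identity) and a hypothetical bucket capacity of 2; overflow chaining and coalescing on deletion are omitted.

```python
BUCKET_CAPACITY = 2  # assumed tiny so splits happen quickly

class Bucket:
    def __init__(self, local_depth):
        self.local_depth = local_depth  # prefix bits this bucket distinguishes
        self.items = {}

class ExtendableHash:
    def __init__(self, b=8):
        self.b = b                      # bits produced by the hash function
        self.i = 0                      # global depth: prefix bits in use
        self.directory = [Bucket(0)]    # bucket address table

    def _prefix(self, key):
        # The i most significant of the b hash bits index the directory.
        return (hash(key) & ((1 << self.b) - 1)) >> (self.b - self.i)

    def lookup(self, key):
        return self.directory[self._prefix(key)].items.get(key)

    def insert(self, key, value):
        bucket = self.directory[self._prefix(key)]
        if key in bucket.items or len(bucket.items) < BUCKET_CAPACITY:
            bucket.items[key] = value
            return
        if bucket.local_depth == self.i:
            # Only one directory entry points here: double the directory.
            self.i += 1
            self.directory = [b for b in self.directory for _ in (0, 1)]
        # Now > 1 entries point to the bucket: split it, repoint the entries
        # according to the newly distinguished bit, then retry the insert.
        b0, b1 = Bucket(bucket.local_depth + 1), Bucket(bucket.local_depth + 1)
        for j, entry in enumerate(self.directory):
            if entry is bucket:
                bit = (j >> (self.i - b0.local_depth)) & 1
                self.directory[j] = b1 if bit else b0
        for k, v in bucket.items.items():
            self.directory[self._prefix(k)].items[k] = v
        self.insert(key, value)  # may split again under heavy skew

h = ExtendableHash(b=8)
for k in [0x12, 0x9A, 0x5C, 0xDE, 0x34]:
    h.insert(k, hex(k))
print(h.i, h.lookup(0x5C))  # global depth grows to 2; prints: 2 0x5c
```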

Chapter 12: Query Processing (TOC)

12.1 Overview

Steps in Query Processing

12.2 Measures of Query Cost

  • Cost Measure:
    • Number of Block Transfers ($t_T$: Time to Transfer 1 Block)
    • Number of Disk Seeks ($t_S$: Time for 1 Seek)

12.3 Selection Operation

  • Process simple selection operations by performing a linear scan, or by making use of indices.
  • Process complex selections by computing unions and intersections of the results of simple selections.

Cost Estimates for Selection Algorithms

Fixes
  • A3: $h_i * (t_T + t_S) + t_S + b * t_T$
  • A5: $h_i * (t_T + t_S) + t_S + b * t_T$
  • A6: $(h_i + k + n) * (t_T + t_S)$
    • $k$ block transfers and seeks to find other matching index entries in leaf level.

12.4 Sorting (Not Mandatory)

  • Sort relations larger than memory by the external sort–merge algorithm.

External Sort-Merge Algorithm

  1. A number of sorted runs are created; each run is sorted, but contains only some of the records of the relation.

External Sort-Merge 1

  2. The runs are merged.

External Sort-Merge 2

Cost Analysis of External Sort-Merge

  • Let $M$ be the number of blocks in the main memory buffer available for sorting.
  • Let $b_r$ be the number of blocks containing records of relation $r$.
  • Let $b_b$ be the number of buffer blocks allocated to each run.
  • Number of Transfers: $b_r \left( 2 \left\lceil \log_{M - 1}(b_r / M) \right\rceil + 1 \right)$
  • Number of Seeks: $2 \lceil b_r / M \rceil + \lceil b_r / b_b \rceil \left( 2 \left\lceil \log_{M - 1}(b_r / M) \right\rceil - 1 \right)$
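
A compact sketch of the algorithm (illustrative only: each "block" holds a single record, so $M$ is simply the number of records sorted per run), with merge passes combining $M - 1$ runs at a time as in the formulas above.

```python
import heapq

def external_sort(records, M):
    # Pass 0: create ceil(b_r / M) sorted runs of at most M records each.
    runs = [sorted(records[i:i + M]) for i in range(0, len(records), M)]
    # Merge passes: combine M - 1 runs at a time (one buffer block is
    # reserved for output), so ceil(log_{M-1}(b_r / M)) passes are needed.
    while len(runs) > 1:
        runs = [list(heapq.merge(*runs[i:i + M - 1]))
                for i in range(0, len(runs), M - 1)]
    return runs[0] if runs else []

assert external_sort([5, 3, 8, 1, 9, 2, 7], M=3) == [1, 2, 3, 5, 7, 8, 9]
```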

12.5 Join Operation

Nested-Loop Join

  • If the buffer can hold only one block of each relation,
    • Number of Transfers: $n_r * b_s + b_r$
    • Number of Seeks: $n_r + b_r$
  • If the buffer can hold the smaller relation or both relations,
    • Number of Transfers: $b_r + b_s$
    • Number of Seeks: $2$

Block Nested-Loop Join

  • If the buffer can hold only one block of each relation,
    • Number of Transfers: $b_r * b_s + b_r$
    • Number of Seeks: $2 * b_r$
  • If the buffer can hold the smaller relation or both relations,
    • Number of Transfers: $b_r + b_s$
    • Number of Seeks: $2$
  • If the buffer can hold $M$ blocks,
    • Read $M - 2$ blocks for the outer relation.
    • Read 1 block for the inner relation.
    • Number of Transfers: $\lceil b_r / (M - 2) \rceil * b_s + b_r$
    • Number of Seeks: $2 * \lceil b_r / (M - 2) \rceil$
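
A sketch of the $M$-block variant (hypothetical in-memory model where a relation is a list of blocks and a block is a list of tuples): the outer relation is consumed in chunks of $M - 2$ blocks, and the inner relation is scanned once per chunk, matching the $\lceil b_r / (M - 2) \rceil * b_s + b_r$ transfer estimate.

```python
def block_nested_loop_join(outer_blocks, inner_blocks, theta, M):
    result = []
    chunk = M - 2  # M - 2 buffers for the outer, 1 for inner, 1 for output
    for i in range(0, len(outer_blocks), chunk):
        outer_tuples = [t for blk in outer_blocks[i:i + chunk] for t in blk]
        for inner_blk in inner_blocks:  # one full inner scan per outer chunk
            for s in inner_blk:
                result.extend((r, s) for r in outer_tuples if theta(r, s))
    return result

# Usage: equijoin on the first field; M = 3 degenerates to one outer block
# per chunk, i.e., the plain block nested-loop join.
r = [[(1, "a"), (2, "b")], [(3, "c")]]
s = [[(1, "x")], [(3, "y"), (4, "z")]]
print(block_nested_loop_join(r, s, lambda u, v: u[0] == v[0], M=3))
# [((1, 'a'), (1, 'x')), ((3, 'c'), (3, 'y'))]
```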

Indexed Nested-Loop Join

  • If the buffer can hold only one block of each relation,
    • Cost: $b_r (t_T + t_S) + n_r * c$
      • Let $c$ be the average cost of traversing the index and fetching all the matching $s$ tuples for one tuple $t_r$ of $r$.

Sort-Merge Join (Not Mandatory)

  • If input relations $r$ and $s$ are already sorted on the join attributes,
    • Number of Transfers: $b_r + b_s$
    • Number of Seeks: $\lceil b_r / b_b \rceil + \lceil b_s / b_b \rceil$

Hash Join (Not Mandatory)

  • Recursive Partitioning: With a new hash function, each bucket generated by one pass is separately read and partitioned again in the next pass, to create smaller partitions.
    • Required: $M \leq n_h + 1$ or $M \leq (b_s / M) + 1$, approximately $M \leq \sqrt{b_s}$
  • If recursive partitioning is not required,
    • Let $n_h$ be the number of partition pairs.
    • Number of Transfers: $3(b_r + b_s) + 4n_h$
    • Number of Seeks: $2\left( \lceil b_r / b_b \rceil + \lceil b_s / b_b \rceil \right) + 2n_h$
  • If the value of $n_h$ is greater than or equal to the number of blocks of memory, recursive partitioning is required,
    • Number of Transfers: $2(b_r + b_s)\lceil \log_{M - 1}(b_s) - 1 \rceil + b_r + b_s$
    • Number of Seeks: $2\left( \lceil b_r / b_b \rceil + \lceil b_s / b_b \rceil \right)\lceil \log_{M - 1}(b_s) - 1 \rceil$
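
A sketch of the partition-then-build/probe structure (hypothetical, fully in memory: Python dicts stand in for the per-partition hash index, and recursive partitioning is omitted since nothing spills to disk here).

```python
from collections import defaultdict

def hash_join(r, s, key_r, key_s, n_h=4):
    # Partitioning phase: one hash function splits both inputs into n_h pairs.
    parts_r, parts_s = defaultdict(list), defaultdict(list)
    for t in r:
        parts_r[hash(key_r(t)) % n_h].append(t)
    for t in s:
        parts_s[hash(key_s(t)) % n_h].append(t)
    # Build/probe phase: a second hash (a dict) indexes each build partition.
    result = []
    for p in range(n_h):
        build = defaultdict(list)
        for t in parts_r[p]:
            build[key_r(t)].append(t)
        for t in parts_s[p]:                 # probe with the partition of s
            result.extend((u, t) for u in build.get(key_s(t), ()))
    return result

print(hash_join([(1, "a"), (3, "c")], [(1, "x"), (2, "y")],
                key_r=lambda t: t[0], key_s=lambda t: t[0]))
# [((1, 'a'), (1, 'x'))]
```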

12.6 Other Operations

  • Duplicate elimination, projection, set operations (union, intersection, and difference), and aggregation can be done by sorting or by hashing.

Chapter 13: Query Optimization (Not Mandatory) (TOC)

13.1 Overview

  • Given a query, there are generally a variety of methods for computing the answer. It is the responsibility of the system to transform the query as entered by the user into an equivalent query that can be computed more efficiently.
  • The process of finding a good strategy for processing a query is called query optimization.

13.2 Transformation of Relational Expressions

  1. Conjunctive selection operations can be deconstructed into a sequence of individual selections. This transformation is referred to as a cascade of $\sigma$. $$\sigma_{\theta_1 \wedge \theta_2}(E) = \sigma_{\theta_1}\left( \sigma_{\theta_2}(E) \right)$$
  2. Selection operations are commutative. $$\sigma_{\theta_1}\left( \sigma_{\theta_2}(E) \right) = \sigma_{\theta_2}\left( \sigma_{\theta_1}(E) \right)$$
  3. Only the final operation in a sequence of projection operations is needed; the others can be omitted. This transformation is referred to as a cascade of $\prod$. $$\prod_{L_1}\left( \prod_{L_2}\left( ... \left( \prod_{L_n}\left( E \right) \right) ... \right) \right) = \prod_{L_1}(E)$$
  4. Selections can be combined with Cartesian products and theta joins.
    1. $\sigma_{\theta}(E_1 \times E_2) = E_1 \Join_{\theta} E_2$
    2. $\sigma_{\theta_1}(E_1 \Join_{\theta_2} E_2) = E_1 \Join_{\theta_1 \wedge \theta_2} E_2$
  5. Theta-join operations are commutative. $$E_1 \Join_{\theta} E_2 = E_2 \Join_{\theta} E_1$$
  6. Natural-join operations are associative. $$(E_1 \Join E_2) \Join E_3 = E_1 \Join (E_2 \Join E_3)$$
  7. Theta joins are associative in the following manner, where $\theta_2$ involves attributes from only $E_2$ and $E_3$. $$(E_1 \Join_{\theta_1} E_2) \Join_{\theta_2 \wedge \theta_3} E_3 = E_1 \Join_{\theta_1 \wedge \theta_3} (E_2 \Join_{\theta_2} E_3)$$
  8. The selection operation distributes over the theta-join operation under the following two conditions:
    1. It distributes when all the attributes in selection condition $\theta_0$ involve only the attributes of one of the expressions (say, $E_1$) being joined. $$\sigma_{\theta_0}(E_1 \Join_{\theta} E_2) = \left( \sigma_{\theta_0}(E_1) \right) \Join_{\theta} E_2$$
    2. It distributes when selection condition $\theta_1$ involves only the attributes of $E_1$ and $\theta_2$ involves only the attributes of $E_2$. $$\sigma_{\theta_1 \wedge \theta_2}(E_1 \Join_{\theta} E_2) = \left( \sigma_{\theta_1}(E_1) \right) \Join_{\theta} \left( \sigma_{\theta_2}(E_2) \right)$$
  9. The projection operation distributes over the theta-join operation under the following conditions.
    1. Let $L_1$ and $L_2$ be attributes of $E_1$ and $E_2$, respectively. Suppose that the join condition $\theta$ involves only attributes in $L_1 \cup L_2$. Then, $$\prod_{L_1 \cup L_2}(E_1 \Join_{\theta} E_2) = \left( \prod_{L_1}(E_1) \right) \Join_{\theta} \left( \prod_{L_2}(E_2) \right)$$
    2. Consider a join $E_1 \Join_{\theta} E_2$. Let $L_1$ and $L_2$ be sets of attributes from $E_1$ and $E_2$, respectively. Let $L_3$ be attributes of $E_1$ that are involved in join condition $\theta$, but are not in $L_1 \cup L_2$, and let $L_4$ be attributes of $E_2$ that are involved in join condition $\theta$, but are not in $L_1 \cup L_2$. Then, $$\prod_{L_1 \cup L_2}(E_1 \Join_{\theta} E_2) = \prod_{L_1 \cup L_2}\left( \left( \prod_{L_1 \cup L_3}(E_1) \right) \Join_{\theta} \left( \prod_{L_2 \cup L_4}(E_2) \right) \right)$$
  10. The set operations union and intersection are commutative. $$E_1 \cup E_2 = E_2 \cup E_1$$ $$E_1 \cap E_2 = E_2 \cap E_1$$
  11. Set union and intersection are associative. $$(E_1 \cup E_2) \cup E_3 = E_1 \cup (E_2 \cup E_3)$$ $$(E_1 \cap E_2) \cap E_3 = E_1 \cap (E_2 \cap E_3)$$
  12. The selection operation distributes over the union, intersection, and set-difference operations. $$\sigma_{p}(E_1 - E_2) = \sigma_{p}(E_1) - \sigma_{p}(E_2)$$ $$\sigma_{p}(E_1 \cup E_2) = \sigma_{p}(E_1) \cup \sigma_{p}(E_2)$$ $$\sigma_{p}(E_1 \cap E_2) = \sigma_{p}(E_1) \cap \sigma_{p}(E_2)$$
  13. The projection operation distributes over the union operation. $$\prod_{L}(E_1 \cup E_2) = \left( \prod_{L}(E_1) \right) \cup \left( \prod_{L}(E_2) \right)$$
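
Worked Example
  • Rule 1 followed by Rule 8a pushes part of a conjunctive selection into one join input, assuming $\theta_2$ involves only attributes of $E_1$. $$\sigma_{\theta_1 \wedge \theta_2}(E_1 \Join_{\theta} E_2) = \sigma_{\theta_1}\left( \sigma_{\theta_2}(E_1 \Join_{\theta} E_2) \right) = \sigma_{\theta_1}\left( \left( \sigma_{\theta_2}(E_1) \right) \Join_{\theta} E_2 \right)$$
  • Evaluating $\sigma_{\theta_2}$ early shrinks the input to the join, which is the usual motivation for applying these rules.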

Chapter 14: Transactions (TOC)

14.1 Transaction Concept

  • A transaction is a unit of program execution that accesses and possibly updates various data items.
  • Atomicity: Either all operations of the transactions are reflected properly in the database, or none are.
  • Consistency: Execution of a transaction in isolation preserves the consistency of the database.
  • Isolation: Even though multiple transactions may execute concurrently, the system guarantees that, for every pair of transactions $T_i$ and $T_j$, it appears to $T_i$ that either $T_j$ finished execution before $T_i$ started or $T_j$ started execution after $T_i$ finished. Thus, each transaction is unaware of other transactions executing concurrently in the system.
  • Durability: After a transaction completes successfully, the changes it has made to the database persist, even if there are system failures.

14.4 Transaction Atomicity and Durability

Transaction States

  • Active: The initial state; the transaction stays in this state while executing.
  • Partially Committed: After the final statement has been executed.
  • Failed: After the discovery that normal execution can no longer proceed.
  • Aborted: After the transaction has been rolled back and the database has been restored to its state prior to the start of the transaction.
    1. Restart Transaction.
    2. Kill Transaction.
  • Committed: After successful completion.

State Diagram of a Transaction

14.5 Transaction Isolation

  • A schedule is a sequence of instructions that specify the chronological order in which instructions of concurrent transactions are executed.
  • A serial schedule is one in which a transaction that has been started runs to completion before another transaction may start.
  • A transaction that successfully completes its execution ends with a commit instruction.
  • A transaction that fails to successfully complete its execution ends with an abort instruction.

Schedule 1

14.6 Serializability

  • A schedule is serializable if it is equivalent to a serial schedule, so it preserves consistency.

Conflict

  • Given a schedule $S$ in which there are 2 consecutive instructions $I$ and $J$, of transactions $T_i$ and $T_j$, respectively ($i \neq j$),
  • If $I$ and $J$ refer to different data items, then we can swap $I$ and $J$ without affecting the results of any instruction in the schedule.
  • Else, $I$ and $J$ conflict if they are operations by different transactions on the same data item, and at least one of these instructions is a write operation.

Conflict Serializability

  • Let $S$ and $S'$ be schedules for some set $R$ of transactions.
  • If schedule $S$ can be transformed into schedule $S'$ by a series of swaps of NON-CONFLICTING INSTRUCTIONS, then we say that $S$ and $S'$ are conflict equivalent.
  • A schedule $S$ is conflict serializable if it is conflict equivalent to a serial schedule.

Conflict Serializability of Schedule 3 into Schedule 6

  • $T_1$'s $\text{read}(B), \text{write}(B)$ do not conflict with $T_2$'s $\text{read}(A), \text{write}(A)$, so they can be swapped.

Testing for Conflict Serializability

  • Precedence Graph: A directed graph where the vertices are transactions and the edges represent conflicts between their operations.
  • Add an edge $T_i \rightarrow T_j$ if any of the following holds:
    1. $T_i$ executes $\text{write}(Q)$ before $T_j$ executes $\text{read}(Q)$.
    2. $T_i$ executes $\text{read}(Q)$ before $T_j$ executes $\text{write}(Q)$.
    3. $T_i$ executes $\text{write}(Q)$ before $T_j$ executes $\text{write}(Q)$.
  • If the precedence graph for $S$ has a cycle, then schedule $S$ is not conflict serializable.
  • Else, a serializability order of the transactions can be obtained by finding a linear order consistent with the partial order of the precedence graph, by performing a topological sort (see the sketch below).

Topological Sort
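
A sketch of the whole test (hypothetical representation: a schedule is a list of (transaction, operation, item) steps): build the precedence graph from conflicting pairs, then topologically sort it with Kahn's algorithm; failure to order every transaction means a cycle, hence no conflict-serializable order.

```python
from collections import defaultdict, deque

def serializability_order(schedule):
    txns = {t for t, _, _ in schedule}
    edges = defaultdict(set)
    for idx, (ti, op_i, q_i) in enumerate(schedule):
        for tj, op_j, q_j in schedule[idx + 1:]:
            # Conflict: same item, different transactions, >= 1 write.
            if q_i == q_j and ti != tj and "write" in (op_i, op_j):
                edges[ti].add(tj)
    indegree = {t: 0 for t in txns}
    for t in edges:
        for u in edges[t]:
            indegree[u] += 1
    queue = deque(t for t in txns if indegree[t] == 0)
    order = []
    while queue:
        t = queue.popleft()
        order.append(t)
        for u in edges[t]:
            indegree[u] -= 1
            if indegree[u] == 0:
                queue.append(u)
    # A cycle leaves some transaction with nonzero in-degree: not serializable.
    return order if len(order) == len(txns) else None

s = [("T1", "read", "A"), ("T2", "write", "A"),
     ("T2", "read", "B"), ("T1", "write", "B")]
print(serializability_order(s))  # None: T1 -> T2 -> T1 is a cycle
```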

14.7 Transaction Isolation and Atomicity

Recoverable Schedules

  • A recoverable schedule is one where, for each pair of transactions $T_i$ and $T_j$ such that $T_j$ reads a data item previously written by $T_i$, the commit operation of $T_i$ appears before the commit operation of $T_j$.

Nonrecoverable Schedule

Cascading Rollback

  • A cascading rollback occurs when a single transaction failure leads to a series of transaction rollbacks.

Cascadeless Schedules

  • A cascadeless schedule is one where, for each pair of transactions $T_i$ and $T_j$ such that $T_j$ reads a data item previously written by $T_i$, the commit operation of $T_i$ appears before the read operation of $T_j$.
  • Every cascadeless schedule is a recoverable schedule.

14.8 Transaction Isolation Levels

Transaction Isolation Anomalies

  • Phantom Read: The results of a query in one transaction are changed by another transaction before the former commits.
  • Non-Repeatable Read: Repeated reads of the same record in one transaction return different values because of an update made by another transaction.
  • Dirty Read: One transaction reads a value written by another transaction that has not yet committed.

Transaction Isolation Levels

| Isolation Level | Allow Phantom Reads? | Allow Non-Repeatable Reads? | Allow Dirty Reads? |
| --- | --- | --- | --- |
| Serializable | No | No | No |
| Repeatable Read | Yes | No | No |
| Read Committed | Yes | Yes | No |
| Read Uncommitted | Yes | Yes | Yes |

Chapter 15: Concurrency Control (TOC)

  • When several transactions execute concurrently in the database, the consistency of data may no longer be preserved.
  • It is necessary for the system to control the interaction among the concurrent transactions, and this control is achieved through one of a variety of mechanisms called concurrency-control schemes.

15.1 Lock-Based Protocols

Locks

  • Shared Lock: If a transaction $T_i$ has obtained a shared-mode lock (denoted by $S$) on item $Q$, then $T_i$ can read, but cannot write, $Q$.
  • Exclusive Lock: If a transaction $T_i$ has obtained an exclusive-mode lock (denoted by $X$) on item $Q$, then $T_i$ can both read and write $Q$.
  • A transaction may be granted a lock on an item if the requested lock is compatible with all locks already held on that item by other transactions.
  • A lock manager can grant locks requested by a transaction, force a transaction to wait, or force a transaction to abort to avoid a deadlock.
  • A locking protocol is a set of rules indicating when a transaction may lock and unlock each of the data items.
    • Allow Only Conflict-Serializable Schedules

Lock-Compatibility Matrix

Granting of Locks

  • When a transaction $T_i$ requests a lock on a data item $Q$ in a particular mode $M$, the concurrency-control manager grants the lock provided that:
    1. There is no other transaction holding a lock on $Q$ in a mode that conflicts with $M$.
    2. There is no other transaction that is waiting for a lock on $Q$ and that made its lock request before $T_i$.
  • To prevent starvation, a lock request is never blocked by a lock request that was made later.

Schedule with Concurrency-Control Manager

Two-Phase Locking Protocol

  • The two-phase locking protocol allows a transaction to lock a new data item only if that transaction has not yet unlocked any data item.
  • The protocol ensures serializability, but not deadlock freedom.
  • In the absence of information concerning the manner in which data items are accessed, the two-phase locking protocol is both necessary and sufficient for ensuring serializability.
Phases
  1. Growing Phase: A transaction may obtain locks, but may not release any lock.
    • A shared lock can be upgraded into an exclusive lock without releasing the lock.
  2. Shrinking Phase: A transaction may release locks, but may not obtain new locks.
    • An exclusive lock can be downgraded into a shared lock without obtaining a new lock.

Strict Two-Phase Locking Protocol

  • The strict two-phase locking protocol permits release of exclusive locks only at the end of transaction, in order to ensure recoverability and cascadelessness of the resulting schedules.

Rigorous Two-Phase Locking Protocol

  • The rigorous two-phase locking protocol releases all locks only at the end of the transaction, for simplicity at the cost of reduced parallelism.

Automatic Acquisition of Locks

  • When a transaction $T_i$ issues a $\text{read}(Q)$ operation, the system issues a $\text{lock-s}(Q)$ instruction before the operation.
  • When a transaction $T_i$ issues a $\text{write}(Q)$ operation, the system checks to see whether $T_i$ already holds a shared lock on $Q$.
    • If it does, then the system issues an $\text{upgrade}(Q)$ instruction before the operation.
    • Else, the system issues a $\text{lock-x}(Q)$ instruction before the operation.
  • All locks obtained by a transaction are unlocked after that transaction commits or aborts.

Implementation of Locking

Lock Table

  • When a lock request arrives, add a record to the end of the linked list for the data item.
    • Always grant a lock request on a data item that is not currently locked.
    • If it is compatible with the locks that are currently held, and all earlier requests have been granted already, grant the lock request.
    • Else, the lock request has to wait.
  • When an unlock request arrives, delete the record for the corresponding data item.
    • Try to grant any waiting lock request.
  • If a transaction aborts, delete any waiting requests made by the transaction and release all locks held by the transaction.

15.2 Deadlock Handling

  • A deadlock prevention protocol ensures that the system will never enter a deadlock state.
  • A deadlock detection and recovery scheme allows the system to enter a deadlock state and then try to recover.

Deadlock Prevention

  • Pre-Declaration: Each transaction locks all its data items before it begins execution.
  • Lock Ordering: Impose a partial order on all data items and require that a transaction lock data items in the order specified.
  • Timeout-Based: A transaction waits for a lock only for a specified amount of time, and then rolls back if the lock is not granted.
  • Wait-Die (Non-Preemptive) (Elders are Selfless):
    • When transaction $T_i$ requests a data item currently held by $T_j$, $T_i$ is allowed to wait only if it is older than $T_j$.
    • Else, the younger $T_i$ rolls back ("dies").
    • Avoids Starvation
  • Wound-Wait (Preemptive) (Elders are Selfish):
    • When transaction $T_i$ requests a data item currently held by $T_j$, $T_i$ is allowed to wait only if it is younger than $T_j$.
    • Else, the younger $T_j$ rolls back (is "wounded").
    • Avoids Starvation
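
Both schemes reduce to a single comparison of transaction timestamps (a sketch; a smaller timestamp means an older transaction):

```python
def wait_die(ts_requester, ts_holder):
    # Non-preemptive: an older requester may wait; a younger one "dies".
    return "wait" if ts_requester < ts_holder else "roll back requester"

def wound_wait(ts_requester, ts_holder):
    # Preemptive: an older requester "wounds" (rolls back) the younger
    # holder; a younger requester waits.
    return "roll back holder" if ts_requester < ts_holder else "wait"

assert wait_die(1, 2) == "wait"                # older T1 waits for T2's lock
assert wound_wait(1, 2) == "roll back holder"  # older T1 preempts younger T2
```

Because a rolled-back transaction keeps its original timestamp when it restarts, it eventually becomes the oldest transaction in the system, which is why neither scheme starves anyone.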

Deadlock Detection

  • Wait-For Graph: A directed graph where the vertices are transactions and edges are wait-for dependencies.
  • A deadlock exists in the system if and only if the wait-for graph contains a cycle.
  • The system can recover from the deadlock by,
    1. Selecting a victim in the cycle whose rollback will incur the minimum cost, accounting for past rollbacks so the same transaction does not starve.
    2. Performing a total rollback or a partial rollback on the victim to break the cycle.

Wait-For Graph

15.3 Multiple Granularity

  • There are circumstances where it would be advantageous to group several data items and treat them as one aggregate data item for locking purposes, resulting in multiple levels of granularity.
  • When a transaction locks a node in the granularity hierarchy tree explicitly, it implicitly locks all the descendants in the SAME mode.
  • Fine Granularity (Lower in Hierarchy): Greater Concurrency, Higher Locking Overhead
  • Coarse Granularity (Higher in Hierarchy): Less Concurrency, Lower Locking Overhead

Granular Hierarchy

Intention Lock Modes

  • If a node is locked in an intention mode, explicit locking is done at a lower level of the tree (that is, at a finer granularity).
  • Intention locks are put on all the ancestors of a node before that node is locked explicitly.
  • Thus, a transaction does not need to search the entire tree to determine whether it can lock a node successfully.
  • Intention-Shared ($IS$): Indicates intent to lock explicitly at a lower level of the tree but only with shared locks.
  • Intention-Exclusive ($IX$): Indicates intent to lock explicitly at a lower level of the tree with exclusive or shared locks.
  • Shared and Intention-Exclusive ($SIX$): Locks the subtree rooted at a given node explicitly in the shared mode, and indicates intent to lock explicitly at a lower level with exclusive locks.

Compatibility Matrix

Multiple-Granularity Locking Protocol

  • Ensures Serializability; No Deadlock Freedom
  • Locks must be acquired in top-down (root-to-leaf) order.
  • Locks must be released in bottom-up (leaf-to-root) order.
Rules
  1. Transaction $T_i$ must observe the lock-compatibility matrix.
  2. Transaction $T_i$ must lock the root of the tree first and can lock it in any mode.
  3. Transaction $T_i$ can lock a node $Q$ in $S$ or $IS$ mode only if $T_i$ currently has the parent of $Q$ locked in either $IX$ or $IS$ mode.
  4. Transaction $T_i$ can lock a node $Q$ in $X$, $SIX$, or $IX$ mode only if $T_i$ currently has the parent of $Q$ locked in either $IX$ or $SIX$ mode.
  5. Transaction $T_i$ can lock a node only if $T_i$ has not previously unlocked any node (that is, $T_i$ is two phase).
  6. Transaction $T_i$ can unlock a node $Q$ only if $T_i$ currently has none of the children of $Q$ locked.

15.4 Timestamp-Based Protocols

  • A timestamp-ordering scheme ensures conflict serializability by selecting an ordering in advance between every pair of transactions.
  • The timestamps of the transactions determine the serializability order.

Timestamps

  • $W$-timestamp($Q$): Largest timestamp of any transaction that executed $\text{write}(Q)$ successfully.
  • $R$-timestamp($Q$): Largest timestamp of any transaction that executed $\text{read}(Q)$ successfully.

Timestamp Ordering Protocol

  • Suppose that transaction $T_i$ issues $\text{read}(Q)$.
    1. If $\text{TS}(T_i) < W\text{-timestamp}(Q)$, then $T_i$ is attempting to read a value of $Q$ that has already been overwritten.
      1. Reject the $\text{read}(Q)$ operation.
      2. Rollback $T_i$.
    2. If $\text{TS}(T_i) \geq W\text{-timestamp}(Q)$, then $T_i$ is attempting to read the latest value of $Q$.
      1. Return the value of $Q$.
      2. Set $R\text{-timestamp}(Q)$ to $\max(R\text{-timestamp}(Q), \text{TS}(T_i))$.
  • Suppose that transaction $T_i$ issues $\text{write}(Q)$.
    1. If $\text{TS}(T_i) < R\text{-timestamp}(Q)$, then the value of $Q$ that $T_i$ is producing was needed previously, and the system assumed that the value would never be produced.
      1. Reject the $\text{write}(Q)$ operation.
      2. Rollback $T_i$.
    2. If $\text{TS}(T_i) < W\text{-timestamp}(Q)$, then $T_i$ is attempting to write an obsolete value of $Q$.
      1. Reject the $\text{write}(Q)$ operation.
      2. Rollback $T_i$.
    3. Else,
      1. Update the value of $Q$.
      2. Set $W\text{-timestamp}(Q)$ to $\text{TS}(T_i)$.
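
A sketch of the protocol's bookkeeping (hypothetical single-threaded model; timestamps start at 1, 0 means "never accessed", and rollback is signalled with an exception). The `thomas` flag anticipates Thomas' write rule, described next.

```python
class Rollback(Exception):
    pass

class TimestampOrdering:
    def __init__(self):
        self.r_ts, self.w_ts, self.value = {}, {}, {}

    def read(self, ts, q):
        if ts < self.w_ts.get(q, 0):
            raise Rollback(f"T{ts} would read an overwritten value of {q}")
        self.r_ts[q] = max(self.r_ts.get(q, 0), ts)
        return self.value.get(q)

    def write(self, ts, q, v, thomas=False):
        if ts < self.r_ts.get(q, 0):
            raise Rollback(f"a later transaction already read {q}")
        if ts < self.w_ts.get(q, 0):
            if thomas:
                return  # Thomas' write rule: ignore the obsolete write
            raise Rollback(f"T{ts} would write an obsolete value of {q}")
        self.w_ts[q], self.value[q] = ts, v

db = TimestampOrdering()
db.write(1, "Q", "v1")
db.read(3, "Q")             # sets R-timestamp(Q) = 3
try:
    db.write(2, "Q", "v2")  # T2 writes a value that T3 already read
except Rollback as e:
    print("rollback:", e)
```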

Thomas' Write Rule

  • Allows some view-serializable schedules that are not conflict serializable to improve concurrency.
  • Suppose that transaction $T_i$ issues $\text{write}(Q)$.
    1. ...Unchanged...
    2. If $\text{TS}(T_i) < W\text{-timestamp}(Q)$, then $T_i$ is attempting to write an obsolete value of $Q$.
      1. Ignore the $\text{write}(Q)$ operation.
    3. ...Unchanged...

15.6 Multiversion Scheme

  • A multiversion concurrency-control scheme is based on the creation of a new version of a data item for each transaction that writes that item.
  • When a read operation is issued, the system selects one of the versions to be read.
  • The concurrency-control scheme ensures that the version to be read is selected in a manner that ensures serializability, by using timestamps. A read operation always succeeds.
    • In multiversion timestamp ordering, a write operation may result in the rollback of the transaction.
    • In multiversion two-phase locking, write operations may result in a lock wait or, possibly, in deadlock.

Chapter 16: Recovery System (TOC)

16.1 Failure Classification

  • Logical Error: Transaction can no longer continue with its normal execution because of some internal condition, such as bad input, data not found, overflow, or resource limit exceeded.
  • System Error: The system has entered an undesirable state, as a result of which a transaction cannot continue with its normal execution. The transaction, however, can be reexecuted at a later time.
  • System Crash: There is a hardware or a software malfunction that brings transaction processing to a halt.
    • Fail-Stop Assumption: When a system halts, it does not corrupt the nonvolatile storage contents.
  • Disk Failure: A disk block loses its content as a result of either a head crash or failure during a data-transfer operation.

16.2 Storage

  • Volatile Storage: Lose Data @ Crash
  • Nonvolatile Storage: Keep Data @ Crash; Lose Data @ Disk Failure
  • Stable Storage: Never Lose Data

Data Access

  • Physical Blocks = Disk Blocks
  • Buffer Blocks = Memory Blocks
  • Each transaction $T_i$ has a private work area in which it keeps local copies of all data items it accesses and updates.
    • $\text{read}(X)$ is always executed before the transaction accesses $X$ for the first time.
    • $\text{write}(X)$ may be executed at any time before the transaction commits.
Transfer between Disk and Main Memory
  1. $\text{input}(B)$: Physical Block $B$ to Main Memory
  2. $\text{output}(B)$: Buffer Block $B$ to Disk

16.3 Recovery and Atomicity

Log Records

  • A log is a sequence of log records, recording all the update activities in the database.
  • Transaction Start Record: < $T_i$ start >
  • Transaction Update Record: < $T_i$, $X$, $V_1$, $V_2$ >
  • Transaction Compensation Record: < $T_i$, $X$, $V_1$ >
  • Transaction Commit Record: < $T_i$ commit >
  • Transaction Abort Record: < $T_i$ abort >

Database Modification

  • Immediate-Modification Scheme: Allow updates of an uncommitted transaction to be made to the buffer or the disk before the transaction commits.
    • Write-Ahead Logging (WAL) Rule: The Update Record must be output to stable storage before the database item is written to disk.
  • Deferred-Modification Scheme: Writes to buffer or the disk only at the time of transaction commit.
    • Advantage: Simple Recovery
    • Disadvantage: Overhead of Work Area

Undo and Redo Transactions

  • Undo: Using a log record, sets the data item specified in the log record to the old value.
    • Undo transaction $T_i$ by going backwards from the last log record of $T_i$, appending a Compensation Record for each restored item and, finally, an Abort Record.
    • Required During Recovery: If the log contains a Start Record for $T_i$ but no Commit Record or Abort Record.
  • Redo: Using a log record, sets the data item specified in the log record to the new value.
    • Redo transaction $T_i$ by going forward from the first log record of $T_i$.
    • Required During Recovery: If the log contains a Start Record for $T_i$ and a Commit Record or an Abort Record.

Repeating History

  • Modern recovery algorithms are based on the concept of repeating history, whereby all actions taken during normal operation (since the last completed checkpoint) are replayed during the redo pass of recovery.
  • Repeating history restores the system state to what it was at the time the last log record was output to stable storage before the system crashed.
  • Undo is then performed from this state, by executing an undo pass that processes log records of incomplete transactions in reverse order.
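
A log-only sketch of the two passes (hypothetical record shapes: ('start', T), ('update', T, X, old, new), ('commit'/'abort', T)): the redo pass repeats history forward, and the undo pass rolls back incomplete transactions in reverse, appending Compensation and Abort records.

```python
def recover(log):
    db, started, finished = {}, set(), set()
    for rec in log:                        # redo pass: repeat history
        if rec[0] == "start":
            started.add(rec[1])
        elif rec[0] == "update":
            _, txn, item, old, new = rec
            db[item] = new
        else:                              # "commit" or "abort"
            finished.add(rec[1])
    for rec in list(reversed(log)):        # undo pass: incomplete txns only
        if rec[0] == "update" and rec[1] not in finished:
            _, txn, item, old, new = rec
            db[item] = old
            log.append(("compensation", txn, item, old))
    for txn in started - finished:
        log.append(("abort", txn))
    return db

log = [("start", "T1"), ("update", "T1", "A", 10, 20),
       ("start", "T2"), ("update", "T2", "B", 5, 6), ("commit", "T2")]
print(recover(log))  # {'A': 10, 'B': 6}: T1 is undone, T2's update survives
```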

Checkpoints

  • To reduce the overhead of searching the log and redoing transactions, we can use checkpointing techniques.
  • Transactions are not allowed to perform any update actions while a checkpoint is in progress.
  • The $\text{redo}$ or $\text{undo}$ operations need to be applied only to transactions in $L$, and to all transactions that started execution after the < checkpoint $L$ > record was written to the log.
Procedure
  1. Output onto stable storage all log records currently residing in main memory.
  2. Output to the disk all modified buffer blocks.
  3. Output onto stable storage a log record of the form < checkpoint $L$ >, where $L$ is a list of transactions active at the time of the checkpoint.

Fuzzy Checkpoint

  • A fuzzy checkpoint is a checkpoint where transactions are allowed to perform updates even while buffer blocks are being written out.
Procedure
  1. Stop all updates by transactions.
  2. Write a < checkpoint $L$ > record.
  3. Flush the log to stable storage.
  4. Resume updates by transactions.
  5. Output to disk all modified buffer blocks.
    • Buffer blocks should not be updated while being output.
    • WAL Rule: All log records pertaining to a block must be output before the block itself is output.
  6. Store a pointer to the < checkpoint $L$ > record in a fixed position last_checkpoint on disk.

16.4 Recovery Algorithm

Example of Logged Actions During Recovery

16.5 Buffer Management

  • Efficient implementation of a recovery scheme requires that the number of writes to the database and to stable storage be minimized.
  • Log records may be kept in volatile log buffer initially, but must be written to stable storage when one of the following conditions occurs:
    • Before the < $T_i$ commit > log record may be output to stable storage, all log records pertaining to transaction $T_i$ must have been output to stable storage.
    • Before a block of data in main memory is output to the database (in nonvolatile storage), all log records pertaining to data in that block must have been output to stable storage.

16.8 ARIES (Not Mandatory)

Major Features

  1. Uses a log sequence number (LSN) to identify log records, and stores LSNs in database pages to identify which operations have been applied to a database page.
  2. Supports physiological redo operations, which are physical in that the affected page is physically identified, but can be logical within the page.
  3. Uses a dirty page table to minimize unnecessary redos during recovery.
    • Dirty pages are those that have been updated in memory, where the disk version is not up-to-date.
  4. Uses a fuzzy-checkpointing scheme that records only information about dirty pages and associated information and does not even require writing of dirty pages to disk. It flushes dirty pages in the background, continuously, instead of writing them during checkpoints.

Reference

Mohan, C., et al. "ARIES: a transaction recovery method supporting fine-granularity locking and partial rollbacks using write-ahead logging." ACM Transactions on Database Systems (TODS) 17.1 (1992): 94-162.

Chapter 20: Data Warehousing and Mining (TOC)

20.1 Decision-Support Systems

  • Decision-support systems analyze online data collected by transaction-processing systems, to help people make business decisions.
    • Data Analysis
    • Statistical Analysis
    • Data Warehouses
    • Data Mining

20.2 Data Warehousing

  • A data warehouse is a repository (or archive) of information gathered from multiple sources, stored under a unified schema, at a single site.
    • Advantage:
      1. Simple Consolidated Interface
      2. Access to Historical Data
      3. OLTP Not Affected by Analytics
  • ETL: Extract, Transform, Load

Components of a Data Warehouse

Data-Warehouse Architecture

  1. When and How to Gather Data:
    • Source-Driven Architecture: Data sources transmit new information, either continually or periodically.
    • Destination-Driven Architecture: Data warehouse periodically sends requests for new data to the sources.
    • Observation: Data Warehouses = Slightly Out-of-Date Data
  2. What Schema to Use:
    • Schema Integration: Convert data to the integrated schema before storage.
  3. Data Transformation and Cleansing:
    • Data Cleansing: Task of correcting and preprocessing data.
    • Fuzzy Lookup: Approximate matching of incorrect data.
    • Merge-Purge Operation: Deduplication
    • Householding: Grouping
  4. How to Propagate Updates:
    • The relations in the warehouse schema can be treated as materialized views over the schemas of the data sources, so updates at the sources can be propagated via view maintenance.
  5. What Data to Summarize:
    • If raw data sets are too large to store online, aggregate values are often sufficient to query.

Warehouse Schemas

Star Schema

  • Fact Tables: Tables that contain multidimensional data, with dimension attributes and measure attributes.
  • Dimension Tables: Tables that are referenced by dimension attributes.
  • Star Schema: A fact table with multiple dimension tables and foreign keys from the fact table to the dimension tables.
  • Snowflake Schema: A star schema with multiple levels of dimension tables.

20.3 Data Mining

  • Data Mining: The process of semiautomatically analyzing large databases to find useful patterns.
  • Rules: Represents Knowledge $\rightarrow$ Predict Outcome
  • Prediction: Based on History
    • Classification: Given an unknown item, predict to which class it belongs.
    • Regression: Given a set of mappings for an unknown function, predict the function result for a new parameter value.
  • Associations: Find Similarities $\rightarrow$ Clusters

20.4 Classification

  • Classification: Given that items belong to one of several classes, and given past training instances of items along with the classes to which they belong, the problem is to predict the class to which a new item belongs.
    • Find Rules $\rightarrow$ Partition Data $\rightarrow$ Disjoint Groups

Decision-Tree

  • Decision-Tree: A tree in which each leaf node has an associated class, and each internal node has a predicate associated with it.
  • Overfitting: A decision-tree is overfitted if it has been so highly tuned to the specifics of the training data that it makes classification errors on other data.
Classifying
  1. Start at the root and traverse the tree to reach a leaf.
  2. At an internal node, evaluate the predicate on the data, to decide which child to traverse.
  3. Repeat Step 2 until a leaf node is reached.
Construction

Recursive Construction of Decision Tree

Best Splits for Decision Trees

  • Impure: When a training set contains instances from many classes.
  • Gini Measure: Suppose there are $k$ classes, and of the instances in training set $S$ the fraction of instances in class $i$ is $p_i$; then the impurity of $S$ is defined as the following. $$\text{Impurity}(S) = \text{Gini}(S) = 1 - \sum_{i = 1}^{k} p_i^2$$
    • When all instances are in a single class, the Gini value is 0, while it reaches its maximum of $1 - 1/k$ if each class has the same number of instances.
  • Entropy Measure: $$\text{Impurity}(S) = \text{Entropy}(S) = - \sum_{i = 1}^{k} p_i \log_{2} p_i$$
  • Classification Error: $$\text{Impurity}(S) = \text{Classification Error}(S) = 1 - \max(p_i)$$
Best Splits and Information
  • When a set $S$ is split into multiple sets $S_i$, the impurity of the resultant set of sets is the following. $$\text{Impurity}(S_1, S_2, ..., S_r) = \sum_{i = 1}^{r} \frac{\lvert S_i \rvert}{\lvert S \rvert} \text{Impurity}(S_i)$$
  • Information Gain: Benefit of a Split $$\text{InformationGain}(S, \{S_1, S_2, ..., S_r\}) = \text{Impurity}(S) - \text{Impurity}(S_1, S_2, ..., S_r)$$
  • Information Content: Cost of a Split $$\text{InformationContent}(S, \{S_1, S_2, ..., S_r\}) = - \sum_{i = 1}^{r} \frac{\lvert S_i \rvert}{\lvert S \rvert} \log_{2} \frac{\lvert S_i \rvert}{\lvert S \rvert}$$
  • Information Gain Ratio: Maximize for Best Split $$\text{InformationGainRatio} = \frac{\text{InformationGain}(S, \{S_1, S_2, ..., S_r\})}{\text{InformationContent}(S, \{S_1, S_2, ..., S_r\})}$$
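
The measures above in executable form (a small sketch; class distributions are given as lists of per-class instance counts):

```python
from math import log2

def gini(counts):
    n = sum(counts)
    return 1 - sum((c / n) ** 2 for c in counts)

def entropy(counts):
    n = sum(counts)
    return -sum((c / n) * log2(c / n) for c in counts if c)

def information_gain_ratio(parent, children, impurity=entropy):
    n = sum(parent)
    weighted = sum(sum(ch) / n * impurity(ch) for ch in children)
    gain = impurity(parent) - weighted                   # benefit of the split
    content = -sum(sum(ch) / n * log2(sum(ch) / n) for ch in children)
    return gain / content                                # maximize this

print(gini([4, 4]))      # 0.5, the maximum 1 - 1/k for k = 2 classes
print(entropy([4, 4]))   # 1.0 bit
# A perfectly separating split: a gain of 1 bit at a content cost of 1 bit.
print(information_gain_ratio([4, 4], [[4, 0], [0, 4]]))  # 1.0
```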

Finding Best Splits

Continuous Valued Attributes
  • Binary Split:
    1. Sort training set.
    2. Consider each value as the split point.
    3. Choose the best split.
  • Multi-Way Split:
    • A series of binary splits on the same attribute has roughly the same effect.
Categorical Valued Attributes
  • Binary Split:
    1. Consider each value as the split point.
    2. Choose the best split.
  • Multi-Way Split:
    • A child for each value of the attribute.

Other Types of Classifiers

  • Bayesian Classifiers: Find the distribution of attribute values for each class in the training data.
    • When given a new instance $d$, they use the distribution information to estimate, for each class $c_j$, the probability that instance $d$ belongs to class $c_j$, denoted by $p(c_j \vert d)$.
    • The class with the maximum probability becomes the predicted class for instance $d$. $$p(c_j \vert d) = \frac{p(d \vert c_j)p(c_j)}{p(d)}$$
  • Naive Bayesian Classifiers: Assume attributes have independent distributions. $$p(d \vert c_j) = p(d_1 \vert c_j) * p(d_2 \vert c_j) * ... * p(d_n \vert c_j)$$
    • $p(d_i \vert c_j)$ can be estimated from a histogram on $d_i$ values for each class $c_j$ computed from the training set.

Regression

  • Regression: Predict Values $$Y = a_0 + a_1 * X_1 + a_2 * X_2 + ... + a_n * X_n$$

Validating a Classifier

Binary Outcomes
  • True Positive: Positive Prediction, Positive Test
  • True Negative: Negative Prediction, Negative Test
  • False Positive: Positive Prediction, Negative Test
  • False Negative: Negative Prediction, Positive Test
Measures of Quality
$$pos = t_{pos} + f_{neg} \qquad neg = t_{neg} + f_{pos}$$
  • Accuracy: The fraction of time when the classifier gives the correct classification. $$\frac{(t_{pos} +t_{neg})}{(pos + neg)}$$
  • Recall: How many of the actual positives are classified as positive. $$\frac{t_{pos}}{pos}$$
  • Precision: How often the positive prediction is correct. $$\frac{t_{pos}}{(t_{pos} + f_{pos})}$$
  • Specificity: How many of the actual negative cases are classified as negative. $$\frac{t_{neg}}{neg}$$
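
The four measures computed directly from the confusion-matrix counts (a small sketch):

```python
def classifier_metrics(t_pos, t_neg, f_pos, f_neg):
    pos, neg = t_pos + f_neg, t_neg + f_pos  # actual positives and negatives
    return {
        "accuracy":    (t_pos + t_neg) / (pos + neg),
        "recall":      t_pos / pos,
        "precision":   t_pos / (t_pos + f_pos),
        "specificity": t_neg / neg,
    }

print(classifier_metrics(t_pos=40, t_neg=45, f_pos=5, f_neg=10))
# {'accuracy': 0.85, 'recall': 0.8, 'precision': 0.888..., 'specificity': 0.9}
```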

20.5 Association Rules

$$\text{Antecedent} \implies \text{Consequent}$$
  • An association rule must have an associated population; the population consists of a set of instances.
    • Example: $bread \implies milk$
      • People who buy bread tend to buy milk.
  • Support: A measure of what fraction of the population satisfies both the antecedent and the consequent of the rule.
  • Confidence: A measure of how often the consequent is true when the antecedent is true.

Discovering Association Rules

$$i_1, i_2, ..., i_n \implies i_0$$
  1. Find the set of items with sufficient support (e.g., support $\geq 2\%$), called large itemsets.
  2. For each large itemset $S$, output a rule $S - s \implies s$ for every subset $s \subset S$, provided $S - s \implies s$ has sufficient confidence.
    • $\text{Confidence} = \frac{\text{Support}(S)}{\text{Support}(S - s)}$

Discovering Large Itemsets

  • Sufficient Memory: Single Pass
  • Insufficient Memory: Multiple Passes
  • Optimization: Once an itemset is eliminated because its support is too low, none of its supersets needs to be considered.
A Priori Technique
  1. Pass 1: Calculate the support of all sets with just 1 item. Eliminate the sets with low support.
  2. Pass $i$: For every candidate set of $i$ items, check that all of its subsets of $i - 1$ items are large before counting its support.
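
A small in-memory sketch of the technique (hypothetical: transactions are sets of items, support is a fraction, and the optimization above appears as the subset-pruning step):

```python
from itertools import combinations

def apriori(transactions, min_support):
    n = len(transactions)

    def support(itemset):
        return sum(itemset <= t for t in transactions) / n

    items = {i for t in transactions for i in t}
    level = {frozenset([i]) for i in items
             if support(frozenset([i])) >= min_support}   # pass 1
    large = set()
    while level:
        large |= level
        candidates = {a | b for a in level for b in level
                      if len(a | b) == len(a) + 1}
        # Prune: every (i - 1)-item subset of a candidate must itself be
        # large, otherwise the candidate cannot have sufficient support.
        candidates = {c for c in candidates
                      if all(frozenset(s) in level
                             for s in combinations(c, len(c) - 1))}
        level = {c for c in candidates if support(c) >= min_support}
    return large

baskets = [{"bread", "milk"}, {"bread", "milk", "eggs"}, {"milk"}, {"bread"}]
print(apriori(baskets, min_support=0.5))
# {bread}, {milk}, and {bread, milk} survive; eggs (support 0.25) does not
```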

20.6 Other Types of Associations

  • Correlations look for deviations from expected levels of association.

20.7 Clustering

  • Clustering: Group Similar Points $\rightarrow$ Single Set
  • Hierarchical Clustering:
    • Agglomerative Clustering: Bottom-Up with Small Clusters
    • Divisive Clustering: Top-Down with Large Clusters