Section 2: Dictionaries

Imagine if Excel let you look up data by name instead of by cell coordinates. Instead of remembering that customer data is in column C, you could just ask for “customer_name”. Dictionaries do exactly that: they store and retrieve data using meaningful labels. They’re like a smart filing system where you can instantly find any piece of information without hunting through rows and columns.

Introduction

Dictionaries store key-value pairs, ideal for representing structured data like database records. They are important for data science because they let you organize data by meaningful labels rather than numeric positions, making your code more readable and maintainable.

Creating and Accessing Dictionaries

Dictionaries use curly braces and store data as key-value pairs.

Basic Dictionary Operations

# Creating dictionaries: each entry is written as key: value
customer = {
    "name": "Alice Johnson",
    "email": "alice@email.com",
    "age": 28,
    "premium": True,
    "orders": 15,
    "total_spent": 2847.50
}

# Accessing values by key (a missing key raises KeyError)
print(customer["name"])           # prints: Alice Johnson
print(customer["total_spent"])    # prints: 2847.5

# Safe access with get() method: returns the fallback instead of raising
# when the key is absent ("phone" is not a key of customer)
phone = customer.get("phone", "Not provided")
print(phone)                      # prints: Not provided

# Dictionary information: keys() and values() return views, so wrap them
# in list() to display them as plain lists
print(f"Keys: {list(customer.keys())}")
print(f"Values: {list(customer.values())}")

# Check if key exists ("in" tests keys, not values)
if "email" in customer:
    print("Email found!")

Dictionary Creation Methods

# Method 1: Direct creation with a dictionary literal
product = {
    "name": "Laptop",
    "price": 999.99,
    "category": "Electronics"
}

# Method 2: Using dict() constructor with keyword arguments
# (the keys become strings, so each key must be a valid Python identifier)
product = dict(name="Laptop", price=999.99, category="Electronics")

# Method 3: From two parallel lists; zip() pairs each key with the value
# at the same position and dict() consumes the resulting (key, value) pairs
keys = ["name", "price", "category"]
values = ["Laptop", 999.99, "Electronics"]
product = dict(zip(keys, values))

# Method 4: Empty dictionary, filled afterwards by assigning to keys
empty_dict = {}
empty_dict["new_key"] = "new_value"

Modifying Dictionaries

Dictionaries are mutable, so you can change their contents after creation.

Adding and Updating Values

# Start with basic product info
product = {
    "name": "Laptop",
    "price": 999.99,
    "category": "Electronics"
}

# Adding new key-value pairs: assigning to a key that does not exist yet
# creates it (values can be of any type, including lists)
product["brand"] = "TechCorp"
product["in_stock"] = True
product["reviews"] = [4.5, 4.8, 4.2, 4.9]

# Updating existing values: assigning to an existing key overwrites it
product["price"] = 899.99  # Price drop
product["category"] = "Computers"  # More specific category

# Using update() method for multiple changes: keys from the argument are
# added, or overwritten if they already exist
product.update({
    "warranty": "2 years",
    "shipping": "free",
    "rating": 4.6
})

print(f"Updated product: {product}")

Removing Dictionary Items

# NOTE: this snippet continues with the `product` dictionary built in the
# "Adding and Updating Values" snippet.

# Remove specific key-value pair (raises KeyError if the key is missing)
del product["warranty"]

# Remove and return value; the second argument is returned instead of
# raising KeyError when the key is missing
shipping = product.pop("shipping", "standard")
print(f"Shipping method: {shipping}")

# Remove and return the most recently inserted (key, value) pair as a tuple.
# popitem() exists in every Python 3 version; the last-in, first-out order
# is guaranteed from Python 3.7 onward.
last_item = product.popitem()
print(f"Last item: {last_item}")

# Clear all items (the dictionary object itself remains, now empty)
product.clear()
print(f"Empty product: {product}")

Dictionary Methods

Python provides many methods for working with dictionaries.

Key Dictionary Methods

# Sample customer data
customer = {
    "name": "Bob Smith",
    "email": "bob@email.com",
    "age": 35,
    "orders": [120, 85, 200, 150],
    "premium": False
}

# Get all keys (keys() returns a view; list() makes it printable as a list)
print(f"Keys: {list(customer.keys())}")

# Get all values
print(f"Values: {list(customer.values())}")

# Get key-value pairs as tuples
print(f"Items: {list(customer.items())}")

# Check if key exists ("phone" is not a key here, so the second line
# reports False)
print(f"Has email: {'email' in customer}")
print(f"Has phone: {'phone' in customer}")

# Get value with default: get() does not modify the dictionary
phone = customer.get("phone", "No phone on file")
print(f"Phone: {phone}")

# Set default value if key doesn't exist: unlike get(), setdefault() also
# stores the default in the dictionary, so the lookup below succeeds
customer.setdefault("region", "Unknown")
print(f"Region: {customer['region']}")

Dictionary Iteration

# NOTE: this snippet continues with the `customer` dictionary defined in the
# "Key Dictionary Methods" snippet.

# Iterate over keys (iterating a dict yields its keys, so calling .keys()
# is optional)
for key in customer:
    print(f"Key: {key}")

# Iterate over values
for value in customer.values():
    print(f"Value: {value}")

# Iterate over key-value pairs; each item unpacks into (key, value)
for key, value in customer.items():
    print(f"{key}: {value}")

# Iterate with a position counter; start=1 gives 1-based numbering without
# adding 1 by hand
for position, (key, value) in enumerate(customer.items(), start=1):
    print(f"{position}. {key}: {value}")

Advanced Dictionary Operations

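Beyond simple lookups, dictionaries can model hierarchical records, transform data with comprehensions, and aggregate results — common data science tasks that the examples below walk through.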

Nested Dictionaries

# Nested dictionary for complex data: the values of a dictionary can
# themselves be dictionaries or lists
company_data = {
    "company": "DataCorp",
    "employees": {
        "alice": {
            "name": "Alice Johnson",
            "department": "Analytics",
            "salary": 75000,
            "skills": ["Python", "SQL", "Statistics"]
        },
        "bob": {
            "name": "Bob Smith",
            "department": "Engineering",
            "salary": 85000,
            "skills": ["Python", "JavaScript", "Docker"]
        },
        "carol": {
            "name": "Carol Davis",
            "department": "Analytics",
            "salary": 72000,
            "skills": ["R", "Python", "Machine Learning"]
        }
    },
    "departments": {
        "Analytics": ["alice", "carol"],
        "Engineering": ["bob"]
    }
}

# Access nested data by chaining one key lookup per level
alice_salary = company_data["employees"]["alice"]["salary"]
print(f"Alice's salary: ${alice_salary:,}")

# Get all analytics employees
analytics_team = company_data["departments"]["Analytics"]
print(f"Analytics team: {analytics_team}")

# Calculate average salary by department.
# Step 1: group salaries by department. setdefault() returns the existing
# list for a department, or inserts and returns a new empty list the first
# time that department is seen. Only the records are needed, so iterate
# over values() rather than items().
dept_salaries = {}
for emp_data in company_data["employees"].values():
    dept_salaries.setdefault(emp_data["department"], []).append(emp_data["salary"])

# Step 2: average each group (every list holds at least one salary)
for dept, salaries in dept_salaries.items():
    avg_salary = sum(salaries) / len(salaries)
    print(f"{dept} average salary: ${avg_salary:,.0f}")

Dictionary Comprehensions

# Dictionary comprehension for data transformation
# Current-year sales per quarter
sales_data = {
    "Q1": 45000,
    "Q2": 52000,
    "Q3": 48000,
    "Q4": 61000
}

# Previous-year sales for the same quarters (the baseline for growth)
previous_year = {
    "Q1": 40000,
    "Q2": 45000,
    "Q3": 42000,
    "Q4": 55000
}

# Create new dictionary with growth rates: percentage change of each quarter
# versus the same quarter in previous_year. Every key of sales_data must also
# exist in previous_year, otherwise previous_year[quarter] raises KeyError.
growth_rates = {
    quarter: ((current - previous_year[quarter]) / previous_year[quarter]) * 100
    for quarter, current in sales_data.items()
}

print("Quarterly Growth Rates:")
for quarter, growth in growth_rates.items():
    # "+.1f" always shows the sign and keeps one decimal place
    print(f"{quarter}: {growth:+.1f}%")

# Filter dictionary based on conditions: the trailing "if" keeps only the
# quarters whose growth exceeds 10 percent
high_growth = {
    quarter: growth for quarter, growth in growth_rates.items()
    if growth > 10
}
print(f"High growth quarters: {high_growth}")

Data Analysis with Dictionaries

# Customer segmentation analysis
customers = [
    {"name": "Alice", "spending": 2500, "region": "North", "orders": 8},
    {"name": "Bob", "spending": 800, "region": "South", "orders": 3},
    {"name": "Carol", "spending": 1500, "region": "North", "orders": 5},
    {"name": "David", "spending": 3200, "region": "East", "orders": 12},
    {"name": "Eve", "spending": 900, "region": "West", "orders": 2}
]

# Segment customers by spending:
#   high_value   -> spending >= 2000
#   medium_value -> 1000 <= spending < 2000
#   low_value    -> spending < 1000
segments = {
    "high_value": [],
    "medium_value": [],
    "low_value": []
}

for customer in customers:
    if customer["spending"] >= 2000:
        segments["high_value"].append(customer)
    elif customer["spending"] >= 1000:
        segments["medium_value"].append(customer)
    else:
        segments["low_value"].append(customer)

# Analyze segments.
# The loop variable is deliberately NOT called `customers`: reusing that name
# would rebind the full customer list to the last segment's list and silently
# break the regional analysis below.
print("Customer Segmentation Analysis:")
for segment, members in segments.items():
    if members:  # skip empty segments to avoid dividing by zero
        avg_spending = sum(c["spending"] for c in members) / len(members)
        avg_orders = sum(c["orders"] for c in members) / len(members)
        # Turn "high_value" into the display label "High Value"
        print(f"{segment.replace('_', ' ').title()}: {len(members)} customers")
        print(f"  Average spending: ${avg_spending:,.0f}")
        print(f"  Average orders: {avg_orders:.1f}")

# Regional analysis over ALL customers: get(region, 0) supplies a starting
# total of 0 the first time a region is seen
regional_totals = {}
for customer in customers:
    region = customer["region"]
    spending = customer["spending"]
    regional_totals[region] = regional_totals.get(region, 0) + spending

print("\nRegional Spending Totals:")
for region, total in regional_totals.items():
    print(f"{region}: ${total:,}")

Practice Exercise

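Create a comprehensive employee management system using dictionaries. The example below stores employee records keyed by employee ID, computes summary statistics per department and overall, and prints a formatted report: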

# Employee management system
# Outer dictionary: employee ID -> record. Each record is itself a dictionary
# with the same five keys:
#   name         full name (string)
#   department   department name (string), used for grouping
#   salary       salary as a whole number
#   hire_date    date stored as a "YYYY-MM-DD" string
#   performance  numeric performance rating
employees = {
    "EMP001": {
        "name": "Alice Johnson",
        "department": "Analytics",
        "salary": 75000,
        "hire_date": "2022-03-15",
        "performance": 4.2
    },
    "EMP002": {
        "name": "Bob Smith",
        "department": "Engineering",
        "salary": 85000,
        "hire_date": "2021-08-22",
        "performance": 4.5
    },
    "EMP003": {
        "name": "Carol Davis",
        "department": "Analytics",
        "salary": 72000,
        "hire_date": "2023-01-10",
        "performance": 4.0
    },
    "EMP004": {
        "name": "David Wilson",
        "department": "Engineering",
        "salary": 90000,
        "hire_date": "2020-11-05",
        "performance": 4.8
    }
}

def analyze_employees(emp_dict):
    """Analyze employee data and return summary statistics.

    Args:
        emp_dict: Mapping of employee ID to a record dictionary that contains
            at least the keys "department", "salary", and "performance".

    Returns:
        A dictionary with these keys:
            "total_employees": number of records in ``emp_dict``.
            "departments": mapping of department name to a dictionary with
                "count", "total_salary", "avg_performance", and "avg_salary".
            "overall_avg_salary": mean salary across all employees
                (0.0 when ``emp_dict`` is empty).
            "overall_avg_performance": mean performance across all employees
                (0.0 when ``emp_dict`` is empty).
            "highest_paid": the ``(emp_id, record)`` pair with the largest
                salary, or ``None`` when ``emp_dict`` is empty.
            "top_performer": the ``(emp_id, record)`` pair with the largest
                performance, or ``None`` when ``emp_dict`` is empty.
        On ties, the earliest entry in ``emp_dict`` wins.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    total_employees = len(emp_dict)

    # With no employees there is nothing to average and max() would raise,
    # so return an explicit "empty" summary instead of crashing.
    if not emp_dict:
        return {
            "total_employees": 0,
            "departments": {},
            "overall_avg_salary": 0.0,
            "overall_avg_performance": 0.0,
            "highest_paid": None,
            "top_performer": None,
        }

    # Department analysis: accumulate counts and running totals per department.
    departments = {}
    for emp_data in emp_dict.values():
        dept_stats = departments.setdefault(
            emp_data["department"],
            {"count": 0, "total_salary": 0, "avg_performance": 0},
        )
        dept_stats["count"] += 1
        dept_stats["total_salary"] += emp_data["salary"]
        # Holds the running SUM for now; converted to an average below.
        dept_stats["avg_performance"] += emp_data["performance"]

    # Calculate averages (count is at least 1 for every department present).
    for dept_stats in departments.values():
        dept_stats["avg_salary"] = dept_stats["total_salary"] / dept_stats["count"]
        dept_stats["avg_performance"] /= dept_stats["count"]

    # Overall statistics
    salary_sum = sum(emp["salary"] for emp in emp_dict.values())
    performance_sum = sum(emp["performance"] for emp in emp_dict.values())

    return {
        "total_employees": total_employees,
        "departments": departments,
        "overall_avg_salary": salary_sum / total_employees,
        "overall_avg_performance": performance_sum / total_employees,
        # items() yields (emp_id, record); compare on the record's field.
        "highest_paid": max(emp_dict.items(), key=lambda item: item[1]["salary"]),
        "top_performer": max(emp_dict.items(), key=lambda item: item[1]["performance"]),
    }

def generate_employee_report(emp_dict):
    """Print a formatted employee report to standard output.

    The report lists the employee count, overall average salary and
    performance, the highest-paid employee, the top performer, and a
    per-department breakdown, all taken from ``analyze_employees``.

    Args:
        emp_dict: Mapping of employee ID to a record dictionary containing
            the keys "name", "department", "salary", and "performance".

    Returns:
        None. The report is written with ``print``.
    """
    # An empty mapping has no averages or maxima to report, so print a short
    # notice instead of failing while computing them.
    if not emp_dict:
        print("Employee Analysis Report")
        print("=" * 50)
        print("No employee data available.")
        return

    analysis = analyze_employees(emp_dict)

    print("Employee Analysis Report")
    print("=" * 50)
    print(f"Total Employees: {analysis['total_employees']}")
    print(f"Overall Average Salary: ${analysis['overall_avg_salary']:,.0f}")
    print(f"Overall Average Performance: {analysis['overall_avg_performance']:.1f}")

    # Each of these entries is an (emp_id, record) pair; only the record is
    # needed for display.
    print("\nHighest Paid Employee:")
    _, emp_data = analysis['highest_paid']
    print(f"  {emp_data['name']} (${emp_data['salary']:,})")

    print("\nTop Performer:")
    _, emp_data = analysis['top_performer']
    print(f"  {emp_data['name']} (Rating: {emp_data['performance']})")

    print("\nDepartment Breakdown:")
    for dept, dept_data in analysis['departments'].items():
        print(f"  {dept}:")
        print(f"    Employees: {dept_data['count']}")
        print(f"    Average Salary: ${dept_data['avg_salary']:,.0f}")
        print(f"    Average Performance: {dept_data['avg_performance']:.1f}")

# Run the analysis: prints the report for the sample `employees` dictionary
# defined above
generate_employee_report(employees)

Assets

Resources

Python dictionary tutorial: https://docs.python.org/3/tutorial/datastructures.html#dictionaries
Dictionary methods reference: https://docs.python.org/3/library/stdtypes.html#dict
Data modeling with dictionaries: https://realpython.com/python-dicts/

Summary

Dictionaries store key-value pairs and are ideal for representing structured data. They provide flexible access to data by meaningful labels and support efficient operations for data analysis. Key concepts include creation, modification, iteration, and advanced techniques like nested dictionaries and comprehensions.