RELENG_5 to delphijfork patch $Phantasm: delphijweb/research/freebsd/delphijfork.diff,v 1.12 2004/10/04 17:38:52 delphij Exp $ Index: src/lib/libc/stdlib/malloc.c diff -u src/lib/libc/stdlib/malloc.c:1.89.2.1 src/lib/libc/stdlib/malloc.c:1.87.1000.3 --- src/lib/libc/stdlib/malloc.c:1.89.2.1 Tue Sep 7 23:38:34 2004 +++ src/lib/libc/stdlib/malloc.c Mon Jul 5 22:08:17 2004 @@ -221,7 +221,7 @@ static struct pgfree free_list; /* Abort(), user doesn't handle problems. */ -static int malloc_abort; +static int malloc_abort = 0; /* Are we trying to die ? */ static int suicide; @@ -244,7 +244,7 @@ static int malloc_zero; /* junk fill ? */ -static int malloc_junk; +static int malloc_junk = 0; #ifdef HAS_UTRACE Index: src/sys/conf/newvers.sh diff -u src/sys/conf/newvers.sh:1.62.2.15 src/sys/conf/newvers.sh:1.61.1000.5 --- src/sys/conf/newvers.sh:1.62.2.15 Sat Oct 2 22:17:44 2004 +++ src/sys/conf/newvers.sh Thu Aug 19 20:12:17 2004 @@ -32,7 +32,7 @@ TYPE="FreeBSD" REVISION="5.3" -BRANCH="BETA7" +BRANCH="delphij" RELEASE="${REVISION}-${BRANCH}" VERSION="${TYPE} ${RELEASE}" @@ -82,7 +82,7 @@ touch version v=`cat version` u=${USER-root} d=`pwd` h=${HOSTNAME-`hostname`} t=`date` -i=`${MAKE:-make} -V KERN_IDENT` +i=`make -V KERN_IDENT` cat << EOF > vers.c $COPYRIGHT char sccspad[32 - 4 /* sizeof(sccs) */] = { '\\0' }; Index: src/sys/dev/ata/atapi-cd.c diff -u src/sys/dev/ata/atapi-cd.c:1.170.2.1 src/sys/dev/ata/atapi-cd.c:1.170.1000.2 --- src/sys/dev/ata/atapi-cd.c:1.170.2.1 Tue Sep 7 03:36:26 2004 +++ src/sys/dev/ata/atapi-cd.c Tue Sep 7 15:42:33 2004 @@ -117,6 +117,7 @@ } ata_set_name(atadev, "acd", cdp->lun); + ata_controlcmd(atadev, ATA_ATAPI_RESET, 0, 0, 0); acd_get_cap(cdp); /* if this is a changer device, allocate the neeeded lun's */ Index: src/sys/i386/conf/GENERIC diff -u src/sys/i386/conf/GENERIC:1.413.2.6 src/sys/i386/conf/GENERIC:1.405.1000.10 --- src/sys/i386/conf/GENERIC:1.413.2.6 Thu Sep 23 03:23:37 2004 +++ src/sys/i386/conf/GENERIC Sun Sep 26 15:56:37 2004 @@ -19,8 +19,8 @@ # $FreeBSD$ machine i386 -cpu I486_CPU -cpu I586_CPU +#cpu I486_CPU +#cpu I586_CPU cpu I686_CPU ident GENERIC @@ -75,8 +75,8 @@ device atadisk # ATA disk drives device ataraid # ATA RAID drives device atapicd # ATAPI CDROM drives -device atapifd # ATAPI floppy drives -device atapist # ATAPI tape drives +#device atapifd # ATAPI floppy drives +#device atapist # ATAPI tape drives options ATA_STATIC_ID # Static device numbering # SCSI Controllers @@ -271,3 +271,12 @@ device firewire # FireWire bus code device sbp # SCSI over FireWire (Requires scbus and da) device fwe # Ethernet over FireWire (non-standard!) + +# delphij's usual hacks +makeoptions CONF_CFLAGS=-fno-builtin +options INCLUDE_CONFIG_FILE +options ACCEPT_FILTER_DATA +options ACCEPT_FILTER_HTTP +options ZERO_COPY_SOCKETS +options AUTO_EOI_1 + Index: src/sys/kern/init_main.c diff -u src/sys/kern/init_main.c:1.246.2.2 src/sys/kern/init_main.c:1.242.1000.6 --- src/sys/kern/init_main.c:1.246.2.2 Thu Sep 9 18:03:19 2004 +++ src/sys/kern/init_main.c Mon Sep 13 10:40:03 2004 @@ -87,7 +87,7 @@ /* Components of the first process -- never freed. */ static struct session session0; -static struct pgrp pgrp0; +struct pgrp pgrp0; struct proc proc0; struct thread thread0; struct ksegrp ksegrp0; @@ -355,10 +355,8 @@ * Create process 0 (the swapper). 
*/ LIST_INSERT_HEAD(&allproc, p, p_list); - LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; - LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); Index: src/sys/kern/kern_exit.c diff -u src/sys/kern/kern_exit.c:1.245.2.1 src/sys/kern/kern_exit.c:1.229.1000.18 --- src/sys/kern/kern_exit.c:1.245.2.1 Thu Sep 9 18:03:19 2004 +++ src/sys/kern/kern_exit.c Mon Sep 13 10:40:03 2004 @@ -75,7 +75,6 @@ #include #include #include -#include #include /* Required to be non-static for SysVR4 emulator */ @@ -386,7 +385,6 @@ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); - LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); sx_xlock(&proctree_lock); @@ -686,10 +684,7 @@ #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("kern_wait: no residual thread!")); - uma_zfree(proc_zone, p); - sx_xlock(&allproc_lock); - nprocs--; - sx_xunlock(&allproc_lock); + proc_free(p); return (0); } mtx_lock_spin(&sched_lock); Index: src/sys/kern/kern_fork.c diff -u src/sys/kern/kern_fork.c:1.234.2.4 src/sys/kern/kern_fork.c:1.226.1000.13 --- src/sys/kern/kern_fork.c:1.234.2.4 Sat Sep 18 12:11:35 2004 +++ src/sys/kern/kern_fork.c Mon Sep 20 10:13:32 2004 @@ -69,7 +69,6 @@ #include #include #include -#include #include #include @@ -145,48 +144,6 @@ return (error); } -int nprocs = 1; /* process 0 */ -int lastpid = 0; -SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, - "Last used PID"); - -/* - * Random component to lastpid generation. We mix in a random factor to make - * it a little harder to predict. We sanity check the modulus value to avoid - * doing it in critical paths. Don't let it be too small or we pointlessly - * waste randomness entropy, and don't let it be impossibly large. Using a - * modulus that is too big causes a LOT more process table scans and slows - * down fork processing as the pidchecked caching is defeated. - */ -static int randompid = 0; - -static int -sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) -{ - int error, pid; - - error = sysctl_wire_old_buffer(req, sizeof(int)); - if (error != 0) - return(error); - sx_xlock(&allproc_lock); - pid = randompid; - error = sysctl_handle_int(oidp, &pid, 0, req); - if (error == 0 && req->newptr != NULL) { - if (pid < 0 || pid > PID_MAX - 100) /* out of range */ - pid = PID_MAX - 100; - else if (pid < 2) /* NOP */ - pid = 0; - else if (pid < 100) /* Make it reasonable */ - pid = 100; - randompid = pid; - } - sx_xunlock(&allproc_lock); - return (error); -} - -SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, - 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); - int fork1(td, flags, pages, procp) struct thread *td; @@ -196,9 +153,7 @@ { struct proc *p1, *p2, *pptr; uid_t uid; - struct proc *newproc; - int ok, trypid; - static int curfail, pidchecked = 0; + static int curfail; static struct timeval lastfail; struct filedesc *fd; struct filedesc_to_leader *fdtol; @@ -282,138 +237,20 @@ } /* Allocate new proc. */ - newproc = uma_zalloc(proc_zone, M_WAITOK); -#ifdef MAC - mac_init_proc(newproc); -#endif - knlist_init(&newproc->p_klist, &newproc->p_mtx); - - /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); - /* - * Although process entries are dynamically created, we still keep - * a global limit on the maximum number we will create. 
Don't allow - * a nonprivileged user to use the last ten processes; don't let root - * exceed the limit. The variable nprocs is the current number of - * processes, maxproc is the limit. - */ - sx_xlock(&allproc_lock); - uid = td->td_ucred->cr_ruid; - if ((nprocs >= maxproc - 10 && - suser_cred(td->td_ucred, SUSER_RUID) != 0) || - nprocs >= maxproc) { + p2 = proc_alloc(td, flags); + if (!p2) { error = EAGAIN; goto fail; } - /* - * Increment the count of procs running with this uid. Don't allow - * a nonprivileged user to exceed their current limit. - */ - PROC_LOCK(p1); - ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, - (uid != 0) ? lim_cur(p1, RLIMIT_NPROC) : 0); - PROC_UNLOCK(p1); - if (!ok) { - error = EAGAIN; - goto fail; - } - - /* - * Increment the nprocs resource before blocking can occur. There - * are hard-limits as to the number of processes that can run. - */ - nprocs++; - - /* - * Find an unused process ID. We remember a range of unused IDs - * ready to use (from lastpid+1 through pidchecked-1). - * - * If RFHIGHPID is set (used during system boot), do not allocate - * low-numbered pids. - */ - trypid = lastpid + 1; - if (flags & RFHIGHPID) { - if (trypid < 10) - trypid = 10; - } else { - if (randompid) - trypid += arc4random() % randompid; - } -retry: - /* - * If the process ID prototype has wrapped around, - * restart somewhat above 0, as the low-numbered procs - * tend to include daemons that don't exit. - */ - if (trypid >= PID_MAX) { - trypid = trypid % PID_MAX; - if (trypid < 100) - trypid += 100; - pidchecked = 0; - } - if (trypid >= pidchecked) { - int doingzomb = 0; - - pidchecked = PID_MAX; - /* - * Scan the active and zombie procs to check whether this pid - * is in use. Remember the lowest pid that's greater - * than trypid, so we can avoid checking for a while. - */ - p2 = LIST_FIRST(&allproc); -again: - for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { - PROC_LOCK(p2); - while (p2->p_pid == trypid || - (p2->p_pgrp != NULL && - (p2->p_pgrp->pg_id == trypid || - (p2->p_session != NULL && - p2->p_session->s_sid == trypid)))) { - trypid++; - if (trypid >= pidchecked) { - PROC_UNLOCK(p2); - goto retry; - } - } - if (p2->p_pid > trypid && pidchecked > p2->p_pid) - pidchecked = p2->p_pid; - if (p2->p_pgrp != NULL) { - if (p2->p_pgrp->pg_id > trypid && - pidchecked > p2->p_pgrp->pg_id) - pidchecked = p2->p_pgrp->pg_id; - if (p2->p_session != NULL && - p2->p_session->s_sid > trypid && - pidchecked > p2->p_session->s_sid) - pidchecked = p2->p_session->s_sid; - } - PROC_UNLOCK(p2); - } - if (!doingzomb) { - doingzomb = 1; - p2 = LIST_FIRST(&zombproc); - goto again; - } - } +#ifdef MAC + mac_init_proc(p2); +#endif sx_sunlock(&proctree_lock); /* - * RFHIGHPID does not mess with the lastpid counter during boot. - */ - if (flags & RFHIGHPID) - pidchecked = 0; - else - lastpid = trypid; - - p2 = newproc; - p2->p_state = PRS_NEW; /* protect against others */ - p2->p_pid = trypid; - LIST_INSERT_HEAD(&allproc, p2, p_list); - LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); - sx_xunlock(&allproc_lock); - - /* * Malloc things while we don't hold any locks. 
 */
 	if (flags & RFSIGSHARE)
@@ -619,6 +456,7 @@
 	}
 	mtx_unlock(&ktrace_mtx);
 #endif
+	knlist_init(&p2->p_klist, &p2->p_mtx);
 
 	/*
 	 * If PF_FORK is set, the child process inherits the
@@ -743,15 +581,11 @@
 	*procp = p2;
 	return (0);
 fail:
+	uid = td->td_ucred->cr_ruid;
 	sx_sunlock(&proctree_lock);
 	if (ppsratecheck(&lastfail, &curfail, 1))
 		printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
 		    uid);
-	sx_xunlock(&allproc_lock);
-#ifdef MAC
-	mac_destroy_proc(newproc);
-#endif
-	uma_zfree(proc_zone, newproc);
 	if (p1->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p1);
 		thread_single_end();
Index: src/sys/kern/kern_proc.c
diff -u src/sys/kern/kern_proc.c:1.215.2.1 src/sys/kern/kern_proc.c:1.202.1000.21
--- src/sys/kern/kern_proc.c:1.215.2.1	Thu Sep  9 18:03:19 2004
+++ src/sys/kern/kern_proc.c	Mon Sep 13 10:40:03 2004
@@ -37,6 +37,7 @@
 #include "opt_kstack_pages.h"
 
 #include
+#include
 #include
 #include
 #include
@@ -80,12 +81,51 @@
 static void proc_fini(void *mem, int size);
 
 /*
+ * pid to proc lookup is done by indexing the pid_table array.
+ * Since pid numbers are only allocated when an empty slot
+ * has been found, there is no need to search any lists ever.
+ * (an orphaned pgrp will lock the slot, a session will lock
+ * the pgrp with the same number).
+ *
+ * If the table is too small it is reallocated with twice the
+ * previous size and the entries 'unzipped' into the two halves.
+ * A linked list of free entries is passed through the pt_proc
+ * field of 'free' items - set odd to be an invalid ptr.
+ */
+
+struct pid_table {
+	struct proc*	pt_proc;
+	struct pgrp*	pt_pgrp;
+};
+
+#if 1	/* strongly typed cast - should be a noop */
+static __inline intptr_t p2u(struct proc *p) { return (intptr_t)p; }
+#else
+#define	p2u(p) ((intptr_t)p)
+#endif
+
+#define	P_VALID(p)	(!(p2u(p) & 1))
+#define	P_NEXT(p)	(p2u(p) >> 1)
+#define	P_FREE(pid)	((struct proc *)((pid) << 1 | 1))
+
+#define	INITIAL_PID_TABLE_SIZE	(1 << 7)
+#define	RESERVED_PROCSLOT	10
+
+static struct pid_table *pid_table;
+
+static u_int pid_tbl_mask = (INITIAL_PID_TABLE_SIZE) - 1; /* table size 2^n */
+static u_int pid_alloc_lim;	/* max we allocate before growing table */
+static u_int pid_alloc_cnt = 0;
+
+/* links through free slots - never empty! */
+static u_int next_free_pt, last_free_pt, next_free_pt_highid;
+static pid_t pid_max = 1 << 12;	/* largest value we allocate */
+
+static int randompid = 0;
+
+/*
  * Other process lists
  */
-struct pidhashhead *pidhashtbl;
-u_long pidhash;
-struct pgrphashhead *pgrphashtbl;
-u_long pgrphash;
 struct proclist allproc;
 struct proclist zombproc;
 struct sx allproc_lock;
@@ -95,6 +135,9 @@
 uma_zone_t proc_zone;
 uma_zone_t ithread_zone;
 
+int nprocs = 1;		/* process 0 */
+int lastpid = 0;
+
 int kstack_pages = KSTACK_PAGES;
 int uarea_pages = UAREA_PAGES;
 SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "");
@@ -103,26 +146,287 @@
 CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
 
 /*
- * Initialize global process hashing structures.
+ * Initialize global process mapping structures.
  */
 void
 procinit()
 {
+	int i;
 
 	sx_init(&allproc_lock, "allproc");
 	sx_init(&proctree_lock, "proctree");
 	mtx_init(&pargs_ref_lock, "struct pargs.ref", NULL, MTX_DEF);
 	mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
+
+	MALLOC(pid_table, struct pid_table *,
+	    INITIAL_PID_TABLE_SIZE * sizeof *pid_table, M_PROC, M_WAITOK);
+
+#define LINK_EMPTY ((PID_MAX + INITIAL_PID_TABLE_SIZE) & ~(INITIAL_PID_TABLE_SIZE - 1))
+	/*
+	 * Set free list running through table...
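+	 * Each free slot stores P_FREE(use count | next free slot): the
+	 * value is shifted left one bit with bit 0 set, so it can never
+	 * be mistaken for a real, pointer-aligned struct proc address.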
+	 * Preset 'use count' to -1 so we allocate pid 1 next.
+	 */
+	for (i = 0; i <= pid_tbl_mask; i++) {
+		pid_table[i].pt_proc = P_FREE(LINK_EMPTY + i + 1);
+		pid_table[i].pt_pgrp = 0;
+	}
+
+	/* slot 0 is just grabbed */
+	next_free_pt = 1;
+	next_free_pt_highid = RESERVED_PROCSLOT;
+	pid_table[0].pt_proc = &proc0;
+	pid_table[0].pt_pgrp = &pgrp0;
+
+	/* Need to fix up the last entry. */
+	last_free_pt = pid_tbl_mask;
+	pid_table[last_free_pt].pt_proc = P_FREE(LINK_EMPTY);
+
+	/* point at which we grow table - to avoid reusing pids too often */
+	pid_alloc_lim = pid_tbl_mask - 1;
+#undef LINK_EMPTY
+
 	LIST_INIT(&allproc);
 	LIST_INIT(&zombproc);
-	pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
-	pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
 	proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
 	    proc_ctor, proc_dtor, proc_init, proc_fini,
 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uihashinit();
 }
 
+static void
+expand_pid_table(void)
+{
+	u_int pt_size = pid_tbl_mask + 1;
+	struct pid_table *n_pt, *new_pt;
+	struct proc *proc;
+	struct pgrp *pgrp;
+	int i;
+	pid_t pid;
+
+	new_pt = malloc(pt_size * 2 * sizeof *new_pt, M_PROC, M_WAITOK);
+
+	sx_xlock(&allproc_lock);
+	if (pt_size != pid_tbl_mask + 1) {
+		/* Another process beat us to it... */
+		sx_xunlock(&allproc_lock);
+		FREE(new_pt, M_PROC);
+		return;
+	}
+
+	/*
+	 * Copy entries from old table into new one.
+	 * If 'pid' is 'odd' we need to place in the upper half,
+	 * even pid's to the lower half.
+	 *
+	 * Free items stay in the low half so we don't have to
+	 * fixup the reference to them.
+	 *
+	 * We stuff free items on the front of the freelist
+	 * because we can't write to unmodified entries.
+	 *
+	 * Processing the table backwards maintains a semblance
+	 * of issuing pid numbers that increase with time.
+	 */
+	i = pt_size - 1;
+	n_pt = new_pt + i;
+	for (; ; i--, n_pt--) {
+		proc = pid_table[i].pt_proc;
+		pgrp = pid_table[i].pt_pgrp;
+		if (!P_VALID(proc)) {
+			/* Up 'use count' so that link is valid */
+			pid = (P_NEXT(proc) + pt_size) & ~pt_size;
+			proc = P_FREE(pid);
+			if (pgrp)
+				pid = pgrp->pg_id;
+		} else
+			pid = proc->p_pid;
+
+		/* Save entry in appropriate half of table */
+		n_pt[pid & pt_size].pt_proc = proc;
+		n_pt[pid & pt_size].pt_pgrp = pgrp;
+
+		/* Put other piece on start of free list */
+		pid = (pid ^ pt_size) & ~pid_tbl_mask;
+		n_pt[pid & pt_size].pt_proc =
+		    P_FREE((pid & ~pt_size) | next_free_pt);
+		n_pt[pid & pt_size].pt_pgrp = 0;
+		next_free_pt = i | (pid & pt_size);
+		if (i == 0)
+			break;
+	}
+
+	/* Switch tables */
+	n_pt = pid_table;
+	pid_table = new_pt;
+	pid_tbl_mask = pt_size * 2 - 1;
+
+	/*
+	 * pid_max starts as 1 << 12 (4096), once we have 2048
+	 * allocated pids we need it to be larger!
+	 */
+	if (pid_tbl_mask > pid_max) {
+		pid_max = pid_tbl_mask * 2 + 1;
+		pid_alloc_lim |= pid_alloc_lim << 1;
+		if (pid_max > PID_MAX)
+			pid_max = PID_MAX;
+	} else
+		pid_alloc_lim <<= 1;	/* doubles number of free slots... */
+
+	sx_xunlock(&allproc_lock);
+	FREE(n_pt, M_PROC);
+}
+
+/*
+ * Allocate a free proc structure.  This method is called from fork1.
+ *
+ * Expand the mapping table when needed.
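+ *
+ * The pid returned encodes the slot's saved use count plus the table
+ * size plus the slot index, so each reuse of a slot yields a pid that
+ * is larger by at least the table size (before the optional randompid
+ * stride is applied).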
+ */ +struct proc * +proc_alloc(struct thread *td, int flags) +{ + struct proc *p, *p1; + int nxt; + pid_t pid; + struct pid_table *pt; + int ok; + uid_t uid; + int highpid; + + p = uma_zalloc(proc_zone, M_WAITOK); + p->p_state = PRS_NEW; /* protect against others */ + + /* allocate next free pid */ + for (;;expand_pid_table()) { + highpid = 0; + if (pid_alloc_cnt >= pid_alloc_lim) + /* ensure pids cycle through 2000+ values */ + continue; + sx_xlock(&allproc_lock); + if ((flags & RFHIGHPID) && next_free_pt < RESERVED_PROCSLOT) { + highpid = 1; + pt = &pid_table[next_free_pt_highid]; + nxt = P_NEXT(pt->pt_proc); + if (nxt & pid_tbl_mask) { + /* Fix the free list link */ + if (!P_VALID(pid_table[RESERVED_PROCSLOT - 1].pt_proc)) + pid_table[RESERVED_PROCSLOT - 1].pt_proc = + P_FREE(P_NEXT(pid_table[RESERVED_PROCSLOT - 1].pt_proc) + | (nxt & pid_tbl_mask)); + /* XXX: Shouldn't we unlock allproc_lock? */ + break; + } + } else { + pt = &pid_table[next_free_pt]; + nxt = P_NEXT(pt->pt_proc); + /* XXX: Shouldn't we unlock allproc_lock at the break? */ + if (nxt & pid_tbl_mask) + break; + } + /* XXX: Shouldn't we unlock allproc_lock before break? */ + if (nxt & pid_tbl_mask) + break; + /* Table full - expand (NB last entry not used....) */ + sx_xunlock(&allproc_lock); + } + + KASSERT((!P_VALID(pt->pt_proc)), ("Proc slot is not free")); + /* pid is 'saved use count' + 'size' + entry */ + pid = (nxt & ~pid_tbl_mask) + pid_tbl_mask + 1 + + (highpid ? next_free_pt_highid : next_free_pt); + + /* + * Handle this now, so that we don't have to grab the allproc lock + * again later in fork1(). + * + * Although process entries are dynamically created, we still keep + * a global limit on the maximum number we will create. Don't allow + * a nonprivileged user to use the last ten processes; don't let root + * exceed the limit. The variable nprocs is the current number of + * processes, maxproc is the limit. + */ + uid = td->td_ucred->cr_ruid; + if ((nprocs >= maxproc - 10 && + suser_cred(td->td_ucred, SUSER_RUID) != 0) || + nprocs >= maxproc) + goto bad; + + /* + * Increment the nprocs resource before blocking can occur. There + * are hard-limits as to the number of processes that can run. + */ + p1 = td->td_proc; + PROC_LOCK(p1); + ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, + (uid != 0) ? lim_cur(p1, RLIMIT_NPROC) : 0); + PROC_UNLOCK(p1); + if (!ok) + goto bad; + + if (randompid) { + pid += (arc4random() % randompid) * (pid_tbl_mask + 1); + } + + if ((u_int)pid > (u_int)pid_max) + pid &= pid_tbl_mask; + + p->p_pid = pid; + if (highpid) + next_free_pt_highid = nxt & pid_tbl_mask; + else + next_free_pt = nxt & pid_tbl_mask; + + /* Grab table slot */ + pt->pt_proc = p; + pid_alloc_cnt++; + + /* + * RFHIGHPID does not mess with the lastpid counter during boot. 
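+	 * Such allocations are served from next_free_pt_highid, i.e. from
+	 * slots at or above RESERVED_PROCSLOT, whenever the normal free
+	 * list head would hand out a low-numbered slot.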
+ */ + if (!(flags & RFHIGHPID)) + lastpid = pid; + + LIST_INSERT_HEAD(&allproc, p, p_list); + nprocs++; + + sx_xunlock(&allproc_lock); + + return p; + +bad: + uma_zfree(proc_zone, p); + sx_xunlock(&allproc_lock); + return (NULL); +} + +/* + * Free last resources of a process - called from kern_wait (in kern_exit.c) + */ +void +proc_free(struct proc *p) +{ + pid_t pid = p->p_pid; + struct pid_table *pt; + + sx_xlock(&allproc_lock); + + pt = &pid_table[pid & pid_tbl_mask]; + /* save pid use count in slot */ + pt->pt_proc = P_FREE(pid & ~pid_tbl_mask); + + if (pt->pt_pgrp == NULL) { + /* link last freed entry onto ours */ + pid &= pid_tbl_mask; + pt = &pid_table[last_free_pt]; + pt->pt_proc = P_FREE(P_NEXT(pt->pt_proc) | pid); + last_free_pt = pid; + pid_alloc_cnt--; + } + + nprocs--; + sx_xunlock(&allproc_lock); + uma_zfree(proc_zone, p); +} + /* * Prepare a proc for use. */ @@ -241,15 +545,33 @@ register struct proc *p; sx_slock(&allproc_lock); - LIST_FOREACH(p, PIDHASH(pid), p_hash) - if (p->p_pid == pid) { - if (p->p_state == PRS_NEW) { - p = NULL; - break; - } - PROC_LOCK(p); - break; - } + p = pid_table[pid & pid_tbl_mask].pt_proc; + /* Only allow live processes to be found by pid. */ + if (!P_VALID(p) || p->p_pid != pid || p->p_state == PRS_NEW) + p = NULL; + else + PROC_LOCK(p); + /* XXX MP - need to have a reference count... */ + sx_sunlock(&allproc_lock); + return (p); +} + +/* + * Locate a zombie process by number + */ +struct proc * +zpfind(register pid_t pid) +{ + register struct proc *p; + + sx_slock(&allproc_lock); + p = pid_table[pid & pid_tbl_mask].pt_proc; + /* Only allow zombie processes to be found by pid. */ + if (!P_VALID(p) || p->p_pid != pid || p->p_state != PRS_ZOMBIE) + p = NULL; + else + PROC_LOCK(p); + /* XXX MP - need to have a reference count... */ sx_sunlock(&allproc_lock); return (p); } @@ -264,15 +586,21 @@ { register struct pgrp *pgrp; - sx_assert(&proctree_lock, SX_LOCKED); + sx_slock(&allproc_lock); + pgrp = pid_table[pgid & pid_tbl_mask].pt_pgrp; - LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { - if (pgrp->pg_id == pgid) { - PGRP_LOCK(pgrp); - return (pgrp); - } - } - return (NULL); + /* + * Can't look up a pgrp that only exists because the session + * hasn't died yet (traditional) + */ + if (pgrp == NULL || pgrp->pg_id != pgid + || LIST_EMPTY(&pgrp->pg_members)) + pgrp = NULL; + else + PGRP_LOCK(pgrp); + /* XXX MP - need to have a reference count... */ + sx_sunlock(&allproc_lock); + return pgrp; } /* @@ -337,7 +665,7 @@ * As we have an exclusive lock of proctree_lock, * this should not deadlock. 
*/ - LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); + pid_table[pgid & pid_tbl_mask].pt_pgrp = pgrp; pgrp->pg_jobc = 0; SLIST_INIT(&pgrp->pg_sigiolst); PGRP_UNLOCK(pgrp); @@ -436,6 +764,31 @@ } /* + * remove the pg from the PIDTable + */ +static void +pgunlink(pid_t pg_id) +{ + struct pgrp *pgrp; + struct pid_table *pt; + + sx_assert(&proctree_lock, SX_XLOCKED); + pt = &pid_table[pg_id & pid_tbl_mask]; + pgrp = pt->pt_pgrp; + pt->pt_pgrp = 0; + + if (!P_VALID(pt->pt_proc)) { + /* orphaned pgrp, put slot onto free list */ + pg_id &= pid_tbl_mask; + pt = &pid_table[last_free_pt]; + pt->pt_proc = P_FREE(P_NEXT(pt->pt_proc) | pg_id); + last_free_pt = pg_id; + pid_alloc_cnt--; + } + +} + +/* * delete a process group */ static void @@ -459,7 +812,6 @@ if (pgrp->pg_session->s_ttyp != NULL && pgrp->pg_session->s_ttyp->t_pgrp == pgrp) pgrp->pg_session->s_ttyp->t_pgrp = NULL; - LIST_REMOVE(pgrp, pg_hash); savesess = pgrp->pg_session; SESS_LOCK(savesess); i = --savesess->s_count; @@ -470,6 +822,11 @@ ttyrel(savesess->s_ttyp); mtx_destroy(&savesess->s_mtx); FREE(savesess, M_SESSION); + pgunlink(pgrp->pg_id); + } + else { + if (savesess->s_sid != pgrp->pg_id) + pgunlink(pgrp->pg_id); } mtx_destroy(&pgrp->pg_mtx); FREE(pgrp, M_PGRP); @@ -493,6 +850,22 @@ } /* + * Delete session - called from SESSRELE when s_count becomes zero. + */ +void +sessdelete(struct session *ss) +{ + /* + * We keep the pgrp with the same id as the session in + * order to stop a process being given the same pid. + * Since the pgrp holds a reference to the session, it + * must be a 'zombie' pgrp by now. + */ + pgunlink(ss->s_sid); + FREE(ss, M_SESSION); +} + +/* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different @@ -581,24 +954,39 @@ DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; + register struct pid_table *pt; register struct proc *p; - register int i; + int id; + int quit = 0; - for (i = 0; i <= pgrphash; i++) { - if (!LIST_EMPTY(&pgrphashtbl[i])) { - printf("\tindx %d\n", i); - LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { - printf( - "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", - (void *)pgrp, (long)pgrp->pg_id, - (void *)pgrp->pg_session, - pgrp->pg_session->s_count, - (void *)LIST_FIRST(&pgrp->pg_members)); - LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { - printf("\t\tpid %ld addr %p pgrp %p\n", - (long)p->p_pid, (void *)p, - (void *)p->p_pgrp); - } + db_setup_paging(db_simple_pager, &quit, DB_LINES_PER_PAGE); + printf("pid table %p size %x, next %x, last %x\n", + pid_table, pid_tbl_mask+1, + next_free_pt, last_free_pt); + for (pt = pid_table, id = 0; id <= pid_tbl_mask && !quit; id++, pt++) { + p = pt->pt_proc; + if (!P_VALID(p) && !pt->pt_pgrp) + continue; + db_printf(" id %x: ", id); + if (P_VALID(p)) + db_printf("proc %p id %d (0x%x) %s\n", + p, p->p_pid, p->p_pid, p->p_comm); + else + db_printf("next %x use %x\n", + P_NEXT(p) & pid_tbl_mask, + P_NEXT(p) & ~pid_tbl_mask); + if ((pgrp = pt->pt_pgrp)) { + db_printf("\tsession %p, sid %d, count %d, login %s\n", + pgrp->pg_session, pgrp->pg_session->s_sid, + pgrp->pg_session->s_count, + pgrp->pg_session->s_login); + db_printf("\tpgrp %p, pg_id %d, pg_jobc %d, members %p\n", + pgrp, pgrp->pg_id, pgrp->pg_jobc, + pgrp->pg_members.lh_first); + for (p = pgrp->pg_members.lh_first; p != 0 && !quit; + p = p->p_pglist.le_next) { + db_printf("\t\tpid %d addr %p pgrp %p %s\n", + p->p_pid, 
p, p->p_pgrp, p->p_comm);
+		}
+	}
 }
@@ -817,24 +1205,6 @@
 		kp->ki_ppid = p->p_pptr->p_pid;
 }
 
-/*
- * Locate a zombie process by number
- */
-struct proc *
-zpfind(pid_t pid)
-{
-	struct proc *p;
-
-	sx_slock(&allproc_lock);
-	LIST_FOREACH(p, &zombproc, p_list)
-		if (p->p_pid == pid) {
-			PROC_LOCK(p);
-			break;
-		}
-	sx_sunlock(&allproc_lock);
-	return (p);
-}
-
 #define KERN_PROC_ZOMBMASK	0x3
 #define KERN_PROC_NOTHREADS	0x4
@@ -1187,6 +1557,7 @@
 	return (sysctl_handle_string(oidp, sv_name, 0, req));
 }
 
+SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID");
 SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table");
 
@@ -1252,3 +1623,54 @@
 SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Return process table, no threads");
+
+/*
+ * Random component to lastpid generation.  We mix in a random factor to make
+ * it a little harder to predict.  We sanity check the modulus value to avoid
+ * doing it in critical paths.  Don't let it be too small or we pointlessly
+ * waste randomness entropy, and don't let it be impossibly large.  (With the
+ * pid table there is no longer a pid scan to slow down; the modulus now only
+ * scales the random stride added on each allocation.)
+ */
+
+static int
+sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
+{
+	int error, pid;
+
+	error = sysctl_wire_old_buffer(req, sizeof(int));
+	if (error != 0)
+		return (error);
+	pid = randompid;
+	error = sysctl_handle_int(oidp, &pid, 0, req);
+	if (error == 0 && req->newptr != NULL) {
+		if (pid < 0 || pid > 100)	/* out of range */
+			pid = 100;
+		else if (pid < 2)		/* NOP */
+			pid = 0;
+		randompid = pid;
+	}
+	return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
+    0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
+
+static int
+sysctl_kern_currentmaxpid(SYSCTL_HANDLER_ARGS)
+{
+	int error, maxpid;
+
+	maxpid = pid_max;
+	error = sysctl_wire_old_buffer(req, sizeof(int));
+	if (error != 0)
+		return (error);
+	error = sysctl_handle_int(oidp, &maxpid, 0, req);
+	if (error == 0 && req->newptr != NULL) {
+		if (maxpid > pid_tbl_mask * 2 + 1 && maxpid <= PID_MAX)
+			pid_max = maxpid;
+		else
+			error = EINVAL;
+	}
+	return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, maxpid, CTLTYPE_INT|CTLFLAG_RW,
+    0, 0, sysctl_kern_currentmaxpid, "I", "Current Maximum PID");
+
Index: src/sys/netinet/tcp_input.c
diff -u src/sys/netinet/tcp_input.c:1.252 src/sys/netinet/tcp_input.c:1.241.1000.19
--- src/sys/netinet/tcp_input.c:1.252	Wed Aug 18 06:05:54 2004
+++ src/sys/netinet/tcp_input.c	Thu Aug 19 19:29:15 2004
@@ -1,4 +1,54 @@
 /*
+ * Copyright (c) 2002, 2003, 2004 Jeffrey M. Hsu.  All rights reserved.
+ * Copyright (c) 2002, 2003, 2004 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Jeffrey M. Hsu.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 2002, 2003, 2004 Jeffrey M. Hsu. All rights reserved. + * + * License terms: all terms for the DragonFly license above plus the following: + * + * 4. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * + * This product includes software developed by Jeffrey M. Hsu + * for the DragonFly Project. + * + * This requirement may be waived with permission from Jeffrey Hsu. + * This requirement will sunset and may be removed on July 8 2005, + * after which the standard DragonFly license (as shown above) will + * apply. + */ + +/* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * @@ -127,11 +177,20 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); +static int tcp_do_early_retransmit = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, earlyretransmit, CTLFLAG_RW, + &tcp_do_early_retransmit, 0, "Early retransmit"); + static int tcp_do_rfc3390 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); +static int tcp_do_eifel_detect = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, eifel, CTLFLAG_RW, + &tcp_do_eifel_detect, 0, + "Eifel detection algorithm (RFC 3522)"); + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); @@ -436,9 +495,9 @@ struct ip6_hdr *ip6 = NULL; #ifdef INET6 - int isipv6; + boolean_t isipv6; #else - const int isipv6 = 0; + const boolean_t isipv6 = FALSE; #endif #ifdef TCPDEBUG @@ -452,7 +511,7 @@ #endif #ifdef INET6 - isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; + isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? TRUE : FALSE; #endif bzero(&tao, sizeof(tao)); bzero((char *)&to, sizeof(to)); @@ -615,7 +674,7 @@ /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); - if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ + if (fwd_tag != NULL && !isipv6) { /* IPv6 support is not yet */ struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); @@ -786,7 +845,7 @@ struct in_conninfo inc; #ifdef INET6 - inc.inc_isipv6 = isipv6; + inc.inc_isipv6 = (isipv6 == TRUE); #endif if (isipv6) { inc.inc6_faddr = ip6->ip6_src; @@ -1165,19 +1224,27 @@ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery + * + * If Eifel detection applies, then + * it is deterministic, so use it + * unconditionally over the old heuristic + * Otherwise, fall back to the old heuristic. 
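+				 * Eifel (RFC 3522) compares the echoed
+				 * timestamp against t_rexmtTS, recorded
+				 * when the retransmit was sent: an echo
+				 * older than that proves the original
+				 * transmission, not the retransmit, was
+				 * ACKed.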
*/ - if (tp->t_rxtshift == 1 && + if (tcp_do_eifel_detect && + (to.to_flags & TOF_TS) && to.to_tsecr && + (tp->t_flags & TF_FIRSTACCACK)) { + /* Eifel detection applicable. */ + if (to.to_tsecr < tp->t_rexmtTS) { + tcp_revert_congestion_state(tp); + ++tcpstat.tcps_eifeldetected; + } + } else if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { - ++tcpstat.tcps_sndrexmitbad; - tp->snd_cwnd = tp->snd_cwnd_prev; - tp->snd_ssthresh = - tp->snd_ssthresh_prev; - tp->snd_recover = tp->snd_recover_prev; - if (tp->t_flags & TF_WASFRECOVERY) - ENTER_FASTRECOVERY(tp); - tp->snd_nxt = tp->snd_max; - tp->t_badrxtwin = 0; + tcp_revert_congestion_state(tp); + ++tcpstat.tcps_rttdetected; } + tp->t_flags &= ~(TF_FIRSTACCACK | + TF_FASTREXMT | TF_EARLYREXMT); /* * Recalculate the transmit timer / rtt. @@ -1201,9 +1268,7 @@ tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); - if (SEQ_GT(tp->snd_una, tp->snd_recover) && - SEQ_LEQ(th->th_ack, tp->snd_recover)) - tp->snd_recover = th->th_ack - 1; + tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; /* * pull snd_wl2 up to prevent seq wrap relative @@ -1947,7 +2012,7 @@ (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { - tcp_seq onxt = tp->snd_nxt; + tcp_seq onxt; u_int win; /* @@ -1969,6 +2034,12 @@ break; } } +fastretransmit: + if (tcp_do_eifel_detect && + (tp->t_flags & TF_RCVD_TSTMP)) { + tcp_save_congestion_state(tp); + tp->t_flags |= TF_FASTREXMT; + } win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) @@ -1978,6 +2049,7 @@ tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; + onxt = tp->snd_nxt; if (tp->sack_enable) { tcpstat.tcps_sack_recovery_episode++; tp->snd_cwnd = @@ -1992,6 +2064,7 @@ tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); + ++tcpstat.tcps_sndfastrexmit; KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big")); tp->snd_cwnd = tp->snd_ssthresh + @@ -2003,18 +2076,21 @@ } else if (tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; + /* outstanding data */ + uint32_t ownd = tp->snd_max - tp->snd_una; u_int sent; +#define iceildiv(n, d) (((n)+(d)-1) / (d)) KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("dupacks not 1 or 2")); if (tp->t_dupacks == 1) tp->snd_limited = 0; - tp->snd_cwnd = - (tp->snd_nxt - tp->snd_una) + + tp->snd_cwnd = ownd + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; (void) tcp_output(tp); + tp->snd_cwnd = oldcwnd; sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && @@ -2022,9 +2098,24 @@ (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("sent too much")); + KASSERT(sent <= + tp->t_maxseg * 2, + ("sent too many segments")); tp->snd_limited = 2; - } else if (sent > 0) + tcpstat.tcps_sndlimited += 2; + } else if (sent > 0) { ++tp->snd_limited; + ++tcpstat.tcps_sndlimited; + } else if (tcp_do_early_retransmit && + (tcp_do_eifel_detect && + (tp->t_flags & TF_RCVD_TSTMP)) && + tcp_do_newreno && + tp->t_dupacks + 1 >= + iceildiv(ownd, tp->t_maxseg)) { + ++tcpstat.tcps_sndearlyrexmit; + tp->t_flags |= TF_EARLYREXMT; + goto fastretransmit; + } tp->snd_cwnd = oldcwnd; goto drop; } @@ -2073,6 +2164,12 @@ } tp->t_dupacks = 0; if (SEQ_GT(th->th_ack, tp->snd_max)) { + /* + * Detected optimistic ACK attack. + * Force slow-start to de-synchronize attack. 
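+			 * An ACK beyond snd_max acknowledges data that was
+			 * never sent; clamping cwnd to one segment keeps a
+			 * lying receiver from inflating our send rate.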
+ */ + tp->snd_cwnd = tp->t_maxseg; + tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } @@ -2114,15 +2211,20 @@ * original cwnd and ssthresh, and proceed to transmit where * we left off. */ - if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { - ++tcpstat.tcps_sndrexmitbad; - tp->snd_cwnd = tp->snd_cwnd_prev; - tp->snd_ssthresh = tp->snd_ssthresh_prev; - tp->snd_recover = tp->snd_recover_prev; - if (tp->t_flags & TF_WASFRECOVERY) - ENTER_FASTRECOVERY(tp); - tp->snd_nxt = tp->snd_max; - tp->t_badrxtwin = 0; /* XXX probably not required */ + if (tcp_do_eifel_detect && acked && + (to.to_flags & TOF_TS) && to.to_tsecr && + (tp->t_flags & TF_FIRSTACCACK)) { + /* Eifel detection applicable. */ + if (to.to_tsecr < tp->t_rexmtTS) { + ++tcpstat.tcps_eifeldetected; + tcp_revert_congestion_state(tp); + if (tp->t_rxtshift == 1 && + ticks >= tp->t_badrxtwin) + ++tcpstat.tcps_rttcantdetect; + } + } else if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { + tcp_revert_congestion_state(tp); + ++tcpstat.tcps_rttdetected; } /* @@ -2167,6 +2269,9 @@ if (acked == 0) goto step6; + /* Stop looking for an acceptable ACK since one was received. */ + tp->t_flags &= ~(TF_FIRSTACCACK | TF_FASTREXMT | TF_EARLYREXMT); + /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets @@ -2193,16 +2298,14 @@ ourfinisacked = 0; } sowwakeup_locked(so); - /* detect una wraparound */ - if ((tcp_do_newreno || tp->sack_enable) && - !IN_FASTRECOVERY(tp) && - SEQ_GT(tp->snd_una, tp->snd_recover) && - SEQ_LEQ(th->th_ack, tp->snd_recover)) - tp->snd_recover = th->th_ack - 1; - if ((tcp_do_newreno || tp->sack_enable) && - IN_FASTRECOVERY(tp) && - SEQ_GEQ(th->th_ack, tp->snd_recover)) - EXIT_FASTRECOVERY(tp); + if (tcp_do_newreno) { + if (IN_FASTRECOVERY(tp)) { + if (SEQ_GEQ(th->th_ack, tp->snd_recover)) + EXIT_FASTRECOVERY(tp); + } else { + tp->snd_recover = th->th_ack - 1; + } + } tp->snd_una = th->th_ack; if (tp->sack_enable) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) @@ -2897,7 +3000,7 @@ struct rmxp_tao tao; int origoffer = offer; #ifdef INET6 - int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + boolean_t isipv6 = ((inp->inp_vflag & INP_IPV6) ? TRUE : FALSE); size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); @@ -3148,7 +3251,7 @@ u_long thcmtu = 0; size_t min_protoh; #ifdef INET6 - int isipv6 = inc->inc_isipv6 ? 1 : 0; + boolean_t isipv6 = (inc->inc_isipv6 ? TRUE : FALSE); #endif KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); @@ -3227,7 +3330,7 @@ #ifdef INET6 int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #else - const int isipv6 = 0; + const boolean_t isipv6 = FALSE; #endif thflags = th->th_flags; Index: src/sys/netinet/tcp_timer.c diff -u src/sys/netinet/tcp_timer.c:1.66 src/sys/netinet/tcp_timer.c:1.64.1000.5 --- src/sys/netinet/tcp_timer.c:1.66 Tue Aug 17 02:32:07 2004 +++ src/sys/netinet/tcp_timer.c Tue Aug 17 10:13:43 2004 @@ -1,4 +1,54 @@ /* + * Copyright (c) 2002, 2003, 2004 Jeffrey M. Hsu. All rights reserved. + * Copyright (c) 2002, 2003, 2004 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Jeffrey M. Hsu. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 2002, 2003, 2004 Jeffrey M. Hsu. All rights reserved. + * + * License terms: all terms for the DragonFly license above plus the following: + * + * 4. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * + * This product includes software developed by Jeffrey M. Hsu + * for the DragonFly Project. + * + * This requirement may be waived with permission from Jeffrey Hsu. + * This requirement will sunset and may be removed on July 8 2005, + * after which the standard DragonFly license (as shown above) will + * apply. + */ + +/* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * @@ -469,6 +519,41 @@ } void +tcp_save_congestion_state(struct tcpcb *tp) +{ + tp->snd_cwnd_prev = tp->snd_cwnd; + tp->snd_ssthresh_prev = tp->snd_ssthresh; + tp->snd_recover_prev = tp->snd_recover; + if (IN_FASTRECOVERY(tp)) + tp->t_flags |= TF_WASFRECOVERY; + else + tp->t_flags &= ~TF_WASFRECOVERY; + if (tp->t_flags & TF_RCVD_TSTMP) { + tp->t_rexmtTS = ticks; + tp->t_flags |= TF_FIRSTACCACK; + } +} + +void +tcp_revert_congestion_state(struct tcpcb *tp) +{ + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_recover = tp->snd_recover_prev; + if (tp->t_flags & TF_WASFRECOVERY) + ENTER_FASTRECOVERY(tp); + if (tp->t_flags & TF_FASTREXMT) { + ++tcpstat.tcps_sndfastrexmitbad; + if (tp->t_flags & TF_EARLYREXMT) + ++tcpstat.tcps_sndearlyrexmitbad; + } else + ++tcpstat.tcps_sndrtobad; + tp->t_badrxtwin = 0; + tp->t_rxtshift = 0; + tp->snd_nxt = tp->snd_max; +} + +void tcp_timer_rexmt(xtp) void *xtp; { @@ -524,14 +609,9 @@ * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. 
*/ - tp->snd_cwnd_prev = tp->snd_cwnd; - tp->snd_ssthresh_prev = tp->snd_ssthresh; - tp->snd_recover_prev = tp->snd_recover; - if (IN_FASTRECOVERY(tp)) - tp->t_flags |= TF_WASFRECOVERY; - else - tp->t_flags &= ~TF_WASFRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + tcp_save_congestion_state(tp); + tp->t_flags &= ~(TF_FASTREXMT | TF_EARLYREXMT); } tcpstat.tcps_rexmttimeo++; if (tp->t_state == TCPS_SYN_SENT) Index: src/sys/netinet/tcp_var.h diff -u src/sys/netinet/tcp_var.h:1.109 src/sys/netinet/tcp_var.h:1.105.1000.9 --- src/sys/netinet/tcp_var.h:1.109 Tue Aug 17 02:32:07 2004 +++ src/sys/netinet/tcp_var.h Tue Aug 17 10:13:43 2004 @@ -1,4 +1,54 @@ /* + * Copyright (c) 2002, 2003, 2004 Jeffrey M. Hsu. All rights reserved. + * Copyright (c) 2002, 2003, 2004 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Jeffrey M. Hsu. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 2002, 2003, 2004 Jeffrey M. Hsu. All rights reserved. + * + * License terms: all terms for the DragonFly license above plus the following: + * + * 4. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * + * This product includes software developed by Jeffrey M. Hsu + * for the DragonFly Project. + * + * This requirement may be waived with permission from Jeffrey Hsu. + * This requirement will sunset and may be removed on July 8 2005, + * after which the standard DragonFly license (as shown above) will + * apply. + */ + +/* * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. 
* @@ -90,29 +140,33 @@ struct inpcb *t_inpcb; /* back pointer to internet pcb */ int t_state; /* state of this connection */ u_int t_flags; -#define TF_ACKNOW 0x000001 /* ack peer immediately */ -#define TF_DELACK 0x000002 /* ack, but try to delay it */ -#define TF_NODELAY 0x000004 /* don't delay packets to coalesce */ -#define TF_NOOPT 0x000008 /* don't use tcp options */ -#define TF_SENTFIN 0x000010 /* have sent FIN */ -#define TF_REQ_SCALE 0x000020 /* have/will request window scaling */ -#define TF_RCVD_SCALE 0x000040 /* other side has requested scaling */ -#define TF_REQ_TSTMP 0x000080 /* have/will request timestamps */ -#define TF_RCVD_TSTMP 0x000100 /* a timestamp was received in SYN */ -#define TF_SACK_PERMIT 0x000200 /* other side said I could SACK */ -#define TF_NEEDSYN 0x000400 /* send SYN (implicit state) */ -#define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ -#define TF_NOPUSH 0x001000 /* don't push */ -#define TF_REQ_CC 0x002000 /* have/will request CC */ -#define TF_RCVD_CC 0x004000 /* a CC was received in SYN */ -#define TF_SENDCCNEW 0x008000 /* send CCnew instead of CC in SYN */ -#define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ -#define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ -#define TF_LASTIDLE 0x040000 /* connection was previously idle */ -#define TF_RXWIN0SENT 0x080000 /* sent a receiver win 0 in response */ -#define TF_FASTRECOVERY 0x100000 /* in NewReno Fast Recovery */ -#define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */ -#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ +#define TF_ACKNOW 0x00000001 /* ack peer immediately */ +#define TF_DELACK 0x00000002 /* ack, but try to delay it */ +#define TF_NODELAY 0x00000004 /* don't delay packets to coalesce */ +#define TF_NOOPT 0x00000008 /* don't use tcp options */ +#define TF_SENTFIN 0x00000010 /* have sent FIN */ +#define TF_REQ_SCALE 0x00000020 /* have/will request window scaling */ +#define TF_RCVD_SCALE 0x00000040 /* other side has requested scaling */ +#define TF_REQ_TSTMP 0x00000080 /* have/will request timestamps */ +#define TF_RCVD_TSTMP 0x00000100 /* a timestamp was received in SYN */ +#define TF_SACK_PERMIT 0x00000200 /* other side said I could SACK */ +#define TF_NEEDSYN 0x00000400 /* send SYN (implicit state) */ +#define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */ +#define TF_NOPUSH 0x00001000 /* don't push */ +#define TF_REQ_CC 0x00002000 /* have/will request CC */ +#define TF_RCVD_CC 0x00004000 /* a CC was received in SYN */ +#define TF_SENDCCNEW 0x00008000 /* send CCnew instead of CC in SYN */ +#define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */ +#define TF_LQ_OVERFLOW 0x00020000 /* listen queue overflow */ +#define TF_LASTIDLE 0x00040000 /* connection was previously idle */ +#define TF_RXWIN0SENT 0x00080000 /* sent a receiver win 0 in response */ +#define TF_FASTRECOVERY 0x00100000 /* in NewReno Fast Recovery */ +#define TF_WASFRECOVERY 0x00200000 /* was in NewReno Fast Recovery */ +#define TF_SIGNATURE 0x00400000 /* require MD5 digests (RFC2385) */ +#define TF_FIRSTACCACK 0x00800000 /* Look for 1st acceptable ACK. */ +#define TF_FASTREXMT 0x01000000 /* Did Fast Retransmit. */ +#define TF_EARLYREXMT 0x02000000 /* Did Early (Fast) Retransmit. 
*/ + int t_force; /* 1 if forcing out a byte */ tcp_seq snd_una; /* send unacknowledged */ @@ -186,6 +240,7 @@ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ u_long t_badrxtwin; /* window for retransmit recovery */ + u_long t_rexmtTS; /* timestamp of last retransmit */ u_char snd_limited; /* segments limited transmitted */ /* anti DoS counters */ u_long rcv_second; /* start of interval second */ @@ -395,7 +450,16 @@ u_long tcps_sndbyte; /* data bytes sent */ u_long tcps_sndrexmitpack; /* data packets retransmitted */ u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ - u_long tcps_sndrexmitbad; /* unnecessary packet retransmissions */ + u_long tcps_sndfastrexmit; /* Fast Retransmissions */ + u_long tcps_sndearlyrexmit; /* early Fast Retransmissions */ + u_long tcps_sndlimited; /* Limited Transmit packets */ + u_long tcps_sndrtobad; /* spurious RTO retransmissions */ + u_long tcps_sndfastrexmitbad; /* spurious Fast Retransmissions */ + u_long tcps_sndearlyrexmitbad; /* spurious early Fast Retransmissions, + a subset of tcps_sndfastrexmitbad */ + u_long tcps_eifeldetected; /* Eifel-detected spurious rexmits */ + u_long tcps_rttcantdetect; /* Eifel but not 1/2 RTT-detectable */ + u_long tcps_rttdetected; /* RTT-detected spurious RTO rexmits */ u_long tcps_sndacks; /* ack-only packets sent */ u_long tcps_sndprobe; /* window probes sent */ u_long tcps_sndurg; /* packets sent with URG only */ @@ -572,6 +636,8 @@ void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); int tcp_twrespond(struct tcptw *, int); +void tcp_save_congestion_state(struct tcpcb *tp); +void tcp_revert_congestion_state(struct tcpcb *tp); void tcp_setpersist(struct tcpcb *); #ifdef TCP_SIGNATURE int tcp_signature_compute(struct mbuf *, int, int, int, u_char *, u_int); Index: src/sys/sys/proc.h diff -u src/sys/sys/proc.h:1.392.2.9 src/sys/sys/proc.h:1.375.1000.22 --- src/sys/sys/proc.h:1.392.2.9 Sat Sep 18 12:11:35 2004 +++ src/sys/sys/proc.h Mon Sep 20 10:13:34 2004 @@ -88,7 +88,6 @@ * (c) const until freeing */ struct pgrp { - LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ @@ -699,7 +698,7 @@ #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) { \ if (--(s)->s_count == 0) \ - FREE(s, M_SESSION); \ + sessdelete(s); \ } #define STOPEVENT(p, e, v) do { \ @@ -773,18 +772,11 @@ #define PARGS_LOCK(p) mtx_lock(&pargs_ref_lock) #define PARGS_UNLOCK(p) mtx_unlock(&pargs_ref_lock) -#define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) -extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; -extern u_long pidhash; - -#define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) -extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; -extern u_long pgrphash; - extern struct sx allproc_lock; extern struct sx proctree_lock; extern struct mtx pargs_ref_lock; extern struct mtx ppeers_lock; +extern struct pgrp pgrp0; /* Process group for swapper. */ extern struct proc proc0; /* Process slot for swapper. */ extern struct thread thread0; /* Primary thread in proc0. */ extern struct ksegrp ksegrp0; /* Primary ksegrp in proc0. 
*/ @@ -827,6 +819,11 @@ int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void mi_switch(int flags, struct thread *newtd); + +void sessdelete(struct session *); +struct proc *proc_alloc(struct thread *td, int flags); +void proc_free(struct proc *p); + int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); Index: src/usr.bin/netstat/inet.c diff -u src/usr.bin/netstat/inet.c:1.67 src/usr.bin/netstat/inet.c:1.65.1000.4 --- src/usr.bin/netstat/inet.c:1.67 Tue Jul 27 04:18:11 2004 +++ src/usr.bin/netstat/inet.c Tue Jul 27 13:10:32 2004 @@ -383,8 +383,14 @@ "\t\t%lu data packet%s (%lu byte%s)\n"); p2(tcps_sndrexmitpack, tcps_sndrexmitbyte, "\t\t%lu data packet%s (%lu byte%s) retransmitted\n"); - p(tcps_sndrexmitbad, - "\t\t%lu data packet%s unnecessarily retransmitted\n"); + p2a(tcps_sndfastrexmit, tcps_sndearlyrexmit, + "\t\t%lu Fast Retransmit%s (%lu early)\n"); + p(tcps_sndlimited, "\t\t%lu packet%s sent by Limited Transmit\n"); + p(tcps_sndrtobad, "\t\t%lu spurious RTO retransmit%s\n"); + p2a(tcps_sndfastrexmitbad, tcps_sndearlyrexmitbad, + "\t\t%lu spurious Fast Retransmit%s (%lu early)\n"); + p(tcps_eifeldetected, "\t\t%lu Eifel-detected spurious retransmit%s\n"); + p(tcps_rttdetected, "\t\t%lu RTT-detected spurious retransmit%s\n"); p(tcps_mturesent, "\t\t%lu resend%s initiated by MTU discovery\n"); p2a(tcps_sndacks, tcps_delack, "\t\t%lu ack-only packet%s (%lu delayed)\n"); Index: src/usr.sbin/inetd/inetd.c diff -u src/usr.sbin/inetd/inetd.c:1.127 src/usr.sbin/inetd/inetd.c:1.126.1000.2 --- src/usr.sbin/inetd/inetd.c:1.127 Sat Aug 7 12:27:50 2004 +++ src/usr.sbin/inetd/inetd.c Sat Aug 7 22:07:53 2004 @@ -113,6 +113,8 @@ #include #include #include +#include +#include #include #include @@ -202,6 +204,7 @@ #ifndef TOOMANY #define TOOMANY 256 /* don't start more than TOOMANY */ #endif + #define CNT_INTVL 60 /* servers in CNT_INTVL sec. 
*/ #define RETRYTIME (60*10) /* retry after bind or server fail */ #define MAX_MAXCHLD 32767 /* max allowable max children */ @@ -209,8 +212,6 @@ #define SIGBLOCK (sigmask(SIGCHLD)|sigmask(SIGHUP)|sigmask(SIGALRM)) void close_sep(struct servtab *); -void flag_signal(int); -void flag_config(int); void config(void); int cpmip(const struct servtab *, int); void endconfig(void); @@ -220,11 +221,8 @@ int matchservent(const char *, const char *, const char *); char *nextline(FILE *); void addchild(struct servtab *, int); -void flag_reapchild(int); -void reapchild(void); void enable(struct servtab *); void disable(struct servtab *); -void flag_retry(int); void retry(void); int setconfig(void); void setup(struct servtab *); @@ -235,7 +233,6 @@ static struct conninfo *search_conn(struct servtab *sep, int ctrl); static int room_conn(struct servtab *sep, struct conninfo *conn); static void addchild_conn(struct conninfo *conn, pid_t pid); -static void reapchild_conn(pid_t pid); static void free_conn(struct conninfo *conn); static void resize_conn(struct servtab *sep, int maxperip); static void free_connlist(struct servtab *sep); @@ -250,7 +247,7 @@ int debug = 0; int dolog = 0; int maxsock; /* highest-numbered descriptor */ -fd_set allsock; +int kqsock; int options; int timingout; int toomany = TOOMANY; @@ -266,7 +263,6 @@ struct sockaddr_in6 *bind_sa6; int v6bind_ok = 0; #endif -int signalpipe[2]; #ifdef SANITY_CHECK int nsock; #endif @@ -317,6 +313,7 @@ int main(int argc, char **argv) { + struct kevent kqevlist[16]; struct servtab *sep; struct passwd *pwd; struct group *grp; @@ -340,7 +337,11 @@ #define peer4 p_un.peer_un4 #define peer6 p_un.peer_un6 #define peermax p_un.peer_max - int i; + int i, j; +#ifdef SANITY_CHECK + int k; +#endif + int status; struct addrinfo hints, *res; const char *servname; int error; @@ -517,19 +518,19 @@ } #endif + kqsock = kqueue(); + sa.sa_flags = 0; sigemptyset(&sa.sa_mask); sigaddset(&sa.sa_mask, SIGALRM); sigaddset(&sa.sa_mask, SIGCHLD); sigaddset(&sa.sa_mask, SIGHUP); - sa.sa_handler = flag_retry; + sa.sa_handler = SIG_IGN; sigaction(SIGALRM, &sa, &saalrm); + WATCH_SIG(SIGALRM, retry); config(); - sa.sa_handler = flag_config; sigaction(SIGHUP, &sa, &sahup); - sa.sa_handler = flag_reapchild; - sigaction(SIGCHLD, &sa, &sachld); - sa.sa_handler = SIG_IGN; + WATCH_SIG(SIGHUP, config); sigaction(SIGPIPE, &sa, &sapipe); { @@ -542,27 +543,8 @@ (void)setenv("inetd_dummy", dummy, 1); } - if (pipe(signalpipe) != 0) { - syslog(LOG_ERR, "pipe: %m"); - exit(EX_OSERR); - } - if (fcntl(signalpipe[0], F_SETFD, FD_CLOEXEC) < 0 || - fcntl(signalpipe[1], F_SETFD, FD_CLOEXEC) < 0) { - syslog(LOG_ERR, "signalpipe: fcntl (F_SETFD, FD_CLOEXEC): %m"); - exit(EX_OSERR); - } - FD_SET(signalpipe[0], &allsock); -#ifdef SANITY_CHECK - nsock++; -#endif - if (signalpipe[0] > maxsock) - maxsock = signalpipe[0]; - if (signalpipe[1] > maxsock) - maxsock = signalpipe[1]; - for (;;) { int n, ctrl; - fd_set readable; #ifdef SANITY_CHECK if (nsock == 0) { @@ -570,46 +552,60 @@ exit(EX_SOFTWARE); } #endif - readable = allsock; - if ((n = select(maxsock + 1, &readable, (fd_set *)0, - (fd_set *)0, (struct timeval *)0)) <= 0) { - if (n < 0 && errno != EINTR) { - syslog(LOG_WARNING, "select: %m"); + + if ((n = kevent(kqsock, NULL, 0, kqevlist, + sizeof kqevlist / sizeof *kqevlist, + (struct timespec *)0)) <= 0) { + if (n == -1 && errno != EINTR) { + syslog(LOG_WARNING, "kevent: %m"); sleep(1); } continue; } - /* handle any queued signal flags */ - if (FD_ISSET(signalpipe[0], &readable)) { - int nsig; - if 
-			    syslog(LOG_ERR, "ioctl: %m");
-			    exit(EX_OSERR);
-		    }
-		    while (--nsig >= 0) {
-			    char c;
-			    if (read(signalpipe[0], &c, 1) != 1) {
-				    syslog(LOG_ERR, "read: %m");
-				    exit(EX_OSERR);
-			    }
+
+	    for (j = 0; j < n; j++) {
+		if (kqevlist[j].filter == EVFILT_SIGNAL) {
+		    /* handle any queued signal flags */
 			    if (debug)
-				    warnx("handling signal flag %c", c);
-			    switch(c) {
-			    case 'A': /* sigalrm */
-				    retry();
-				    break;
-			    case 'C': /* sigchld */
-				    reapchild();
-				    break;
-			    case 'H': /* sighup */
-				    config();
-				    break;
-			    }
-		    }
-	    }
-	    for (sep = servtab; n && sep; sep = sep->se_next)
-		if (sep->se_fd != -1 && FD_ISSET(sep->se_fd, &readable)) {
-		    n--;
+			warnx("calling signalhandler for sig %d",
+			    kqevlist[j].ident);
+		    ((void (*)())kqevlist[j].udata)();
+		} else if (kqevlist[j].filter == EVFILT_PROC) {
+		    sep = (struct servtab *)kqevlist[j].udata;
+		    pid = wait4(kqevlist[j].ident, &status, WNOHANG,
+			(struct rusage *)0);
+		    if (debug)
+			warnx("%d reaped, status %#x", pid, status);
+		    if (pid == 0) {
+			/* XXX - this could leave a zombie */
+			syslog(LOG_WARNING, "can't reap pid %d",
+			    kqevlist[j].ident);
+			continue;
+		    }
+#ifdef SANITY_CHECK
+		    for (k = 0; k < sep->se_numchild; k++)
+			if (sep->se_pids[k] == pid)
+			    break;
+		    if (k != sep->se_numchild)
+			sep->se_pids[k] =
+			    sep->se_pids[sep->se_numchild - 1];
+#endif
+		    if (sep->se_maxchild &&
+			sep->se_numchild == sep->se_maxchild)
+			enable(sep);
+		    if (status)
+			syslog(LOG_WARNING,
+			    "%s[%d]: exit status 0x%x",
+			    sep->se_server, pid, status);
+		    /* XXX - this should never happen */
+		    if (--sep->se_numchild < 0)
+			sep->se_numchild = 0;
+		    if (sep->se_free && sep->se_numchild == 0) {
+			freeconfig(sep);
+			free((char *)sep);
+		    }
+		} else {
+		    sep = (struct servtab *)kqevlist[j].udata;
 		    if (debug)
 			warnx("someone wants %s", sep->se_service);
 		    dofork = !sep->se_bi || sep->se_bi->bi_fork || ISWRAP(sep);
@@ -883,21 +879,7 @@
 		if (sep->se_accept && sep->se_socktype == SOCK_STREAM)
 			close(ctrl);
-	}
-}
-
-/*
- * Add a signal flag to the signal flag queue for later handling
- */
-
-void
-flag_signal(int c)
-{
-	char ch = c;
-
-	if (write(signalpipe[1], &ch, 1) != 1) {
-		syslog(LOG_ERR, "write: %m");
-		_exit(EX_OSERR);
+		}
 	}
 }
@@ -909,72 +891,18 @@
 void
 addchild(struct servtab *sep, pid_t pid)
 {
-	if (sep->se_maxchild <= 0)
-		return;
 #ifdef SANITY_CHECK
-	if (sep->se_numchild >= sep->se_maxchild) {
+	if (sep->se_maxchild && sep->se_numchild >= sep->se_maxchild) {
 		syslog(LOG_ERR, "%s: %d >= %d", __FUNCTION__,
 		    sep->se_numchild, sep->se_maxchild);
 		exit(EX_SOFTWARE);
 	}
+	sep->se_pids[sep->se_numchild] = pid;
 #endif
-	sep->se_pids[sep->se_numchild++] = pid;
-	if (sep->se_numchild == sep->se_maxchild)
+	sep->se_numchild++;
+	if (sep->se_maxchild && sep->se_numchild == sep->se_maxchild)
 		disable(sep);
-}
-
-/*
- * Some child process has exited. See if it's on somebody's list.
- */
-
-void
-flag_reapchild(int signo __unused)
-{
-	flag_signal('C');
-}
-
-void
-reapchild(void)
-{
-	int k, status;
-	pid_t pid;
-	struct servtab *sep;
-
-	for (;;) {
-		pid = wait3(&status, WNOHANG, (struct rusage *)0);
-		if (pid <= 0)
-			break;
-		if (debug)
-			warnx("%d reaped, %s %u", pid,
-			    WIFEXITED(status) ? "status" : "signal",
-			    WIFEXITED(status) ? WEXITSTATUS(status)
-			    : WTERMSIG(status));
-		for (sep = servtab; sep; sep = sep->se_next) {
-			for (k = 0; k < sep->se_numchild; k++)
-				if (sep->se_pids[k] == pid)
-					break;
-			if (k == sep->se_numchild)
-				continue;
-			if (sep->se_numchild == sep->se_maxchild)
-				enable(sep);
-			sep->se_pids[k] = sep->se_pids[--sep->se_numchild];
-			if (WIFSIGNALED(status) || WEXITSTATUS(status))
-				syslog(LOG_WARNING,
-				    "%s[%d]: exited, %s %u",
-				    sep->se_server, pid,
-				    WIFEXITED(status) ? "status" : "signal",
-				    WIFEXITED(status) ? WEXITSTATUS(status)
-				    : WTERMSIG(status));
-			break;
-		}
-		reapchild_conn(pid);
-	}
-}
-
-void
-flag_config(int signo __unused)
-{
-	flag_signal('H');
+	WATCH_PROC(pid, sep);
 }
 
 void
@@ -991,8 +919,10 @@
 		syslog(LOG_ERR, "%s: %m", CONFIG);
 		return;
 	}
-	for (sep = servtab; sep; sep = sep->se_next)
+
+	for (sep = servtab; sep != NULL; sep = sep->se_next)
 		sep->se_checked = 0;
+
 	while ((new = getconfigent())) {
 		if (getpwnam(new->se_user) == NULL) {
 			syslog(LOG_ERR,
@@ -1038,12 +968,17 @@
 			/* copy over outstanding child pids */
 			if (sep->se_maxchild > 0 && new->se_maxchild > 0) {
 				new->se_numchild = sep->se_numchild;
+				/* XXX - this can cause problems */
 				if (new->se_numchild > new->se_maxchild)
 					new->se_numchild = new->se_maxchild;
+#ifdef SANITY_CHECK
 				memcpy(new->se_pids, sep->se_pids,
 				    new->se_numchild * sizeof(*new->se_pids));
+#endif
 			}
+#ifdef SANITY_CHECK
 			SWAP(pid_t *, sep->se_pids, new->se_pids);
+#endif
 			sep->se_maxchild = new->se_maxchild;
 			sep->se_numchild = new->se_numchild;
 			sep->se_maxcpm = new->se_maxcpm;
@@ -1052,14 +987,11 @@
 			sep->se_bi = new->se_bi;
 			/* might need to turn on or off service now */
 			if (sep->se_fd >= 0) {
-				if (sep->se_maxchild > 0
-				    && sep->se_numchild == sep->se_maxchild) {
-					if (FD_ISSET(sep->se_fd, &allsock))
-						disable(sep);
-				} else {
-					if (!FD_ISSET(sep->se_fd, &allsock))
-						enable(sep);
-				}
+				if (sep->se_maxchild
+				    && sep->se_numchild == sep->se_maxchild)
+					disable(sep);
+				else
+					enable(sep);
 			}
 			sep->se_accept = new->se_accept;
 			SWAP(char *, sep->se_user, new->se_user);
@@ -1179,8 +1111,11 @@
 			print_service("FREE", sep);
 		if (sep->se_rpc && sep->se_rpc_prog > 0)
 			unregisterrpc(sep);
-		freeconfig(sep);
-		free(sep);
+		if (sep->se_numchild == 0) {
+			freeconfig(sep);
+			free((char *)sep);
+		} else
+			sep->se_free = 1;
 	}
 	(void) sigsetmask(omask);
 }
@@ -1241,12 +1176,6 @@
 }
 
 void
-flag_retry(int signo __unused)
-{
-	flag_signal('A');
-}
-
-void
 retry(void)
 {
 	struct servtab *sep;
@@ -1280,12 +1209,12 @@
 #define	turnon(fd, opt) \
 setsockopt(fd, SOL_SOCKET, opt, (char *)&on, sizeof (on))
 	if (strcmp(sep->se_proto, "tcp") == 0 && (options & SO_DEBUG) &&
-	    turnon(sep->se_fd, SO_DEBUG) < 0)
+	    turnon(sep->se_fd, SO_DEBUG) == -1)
 		syslog(LOG_ERR, "setsockopt (SO_DEBUG): %m");
-	if (turnon(sep->se_fd, SO_REUSEADDR) < 0)
+	if (turnon(sep->se_fd, SO_REUSEADDR) == -1)
 		syslog(LOG_ERR, "setsockopt (SO_REUSEADDR): %m");
 #ifdef SO_PRIVSTATE
-	if (turnon(sep->se_fd, SO_PRIVSTATE) < 0)
+	if (turnon(sep->se_fd, SO_PRIVSTATE) == -1)
 		syslog(LOG_ERR, "setsockopt (SO_PRIVSTATE): %m");
 #endif
 	/* tftpd opens a new connection then needs more infos */
@@ -1293,7 +1222,7 @@
 	    (strcmp(sep->se_proto, "udp") == 0) && (sep->se_accept == 0) &&
 	    (setsockopt(sep->se_fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
-		(char *)&on, sizeof (on)) < 0))
+		(char *)&on, sizeof (on)) == -1))
 		syslog(LOG_ERR, "setsockopt (IPV6_RECVPKTINFO): %m");
 	if (sep->se_family == AF_INET6) {
 		int flag = sep->se_nomapped ? 1 : 0;
@@ -1476,8 +1405,7 @@
 close_sep(struct servtab *sep)
 {
 	if (sep->se_fd >= 0) {
-		if (FD_ISSET(sep->se_fd, &allsock))
-			disable(sep);
+		disable(sep);
 		(void) close(sep->se_fd);
 		sep->se_fd = -1;
 	}
@@ -1516,7 +1444,7 @@
 	long omask;
 
 	sep = (struct servtab *)malloc(sizeof (*sep));
-	if (sep == (struct servtab *)0) {
+	if (sep == NULL) {
 		syslog(LOG_ERR, "malloc: %m");
 		exit(EX_OSERR);
 	}
@@ -1546,14 +1474,9 @@
 		    "%s: %s: is mux", __FUNCTION__, sep->se_service);
 		exit(EX_SOFTWARE);
 	}
-	if (FD_ISSET(sep->se_fd, &allsock)) {
-		syslog(LOG_ERR,
-		    "%s: %s: not off", __FUNCTION__, sep->se_service);
-		exit(EX_SOFTWARE);
-	}
 	nsock++;
 #endif
-	FD_SET(sep->se_fd, &allsock);
+	WATCH_SOCK(sep->se_fd, sep);
 	if (sep->se_fd > maxsock)
 		maxsock = sep->se_fd;
 }
@@ -1575,18 +1498,13 @@
 		    "%s: %s: is mux", __FUNCTION__, sep->se_service);
 		exit(EX_SOFTWARE);
 	}
-	if (!FD_ISSET(sep->se_fd, &allsock)) {
-		syslog(LOG_ERR,
-		    "%s: %s: not on", __FUNCTION__, sep->se_service);
-		exit(EX_SOFTWARE);
-	}
 	if (nsock == 0) {
 		syslog(LOG_ERR, "%s: nsock=0", __FUNCTION__);
 		exit(EX_SOFTWARE);
 	}
 	nsock--;
 #endif
-	FD_CLR(sep->se_fd, &allsock);
+	UNWATCH_SOCK(sep->se_fd, sep);
 	if (sep->se_fd == maxsock)
 		maxsock--;
 }
@@ -1971,6 +1889,7 @@
 		else
 			sep->se_maxchild = 1;
 	}
+#ifdef SANITY_CHECK
 	if (sep->se_maxchild > 0) {
 		sep->se_pids = malloc(sep->se_maxchild * sizeof(*sep->se_pids));
 		if (sep->se_pids == NULL) {
@@ -1978,6 +1897,7 @@
 			exit(EX_OSERR);
 		}
 	}
+#endif
 	argc = 0;
 	for (arg = skip(&cp); cp; arg = skip(&cp))
 		if (argc < MAXARGV) {
@@ -2017,8 +1937,10 @@
 #endif
 	if (cp->se_server)
 		free(cp->se_server);
+#ifdef SANITY_CHECK
 	if (cp->se_pids)
 		free(cp->se_pids);
+#endif
 	for (i = 0; i < MAXARGV; i++)
 		if (cp->se_argv[i])
 			free(cp->se_argv[i]);
@@ -2357,6 +2279,36 @@
 	return(r);
 }
 
+void
+watch(filter, ident, data, fflags, addrm)
+	short filter;
+	uintptr_t ident;
+	void *data;
+	u_int fflags;
+	int addrm;
+{
+	struct kevent kev;
+	struct kevent *kptr;
+	int i;
+
+	kptr = &kev;
+
+	kev.ident = ident;
+	kev.filter = filter;
+	kev.flags = addrm ? EV_ADD|EV_ENABLE : EV_DELETE|EV_DISABLE;
+	kev.fflags = fflags;
+	kev.udata = data;
+
+	i = kevent(kqsock, kptr, 1, NULL, 0, NULL);
+
+	if (i == -1)
+		syslog(LOG_ERR, "kevent failed: %m");
+
+	if (debug) {
+		warnx("kqueue, ident: %d, addrm: %d, ret: %d, data: %p, errno: %s", ident, addrm, i, data, strerror(errno));
+	}
+}
+
 static struct conninfo *
 search_conn(struct servtab *sep, int ctrl)
 {
@@ -2468,26 +2420,6 @@
 }
 
 static void
-reapchild_conn(pid_t pid)
-{
-	struct procinfo *proc;
-	struct conninfo *conn;
-	int i;
-
-	if ((proc = search_proc(pid, 0)) == NULL)
-		return;
-	if ((conn = proc->pr_conn) == NULL)
-		return;
-	for (i = 0; i < conn->co_numchild; ++i)
-		if (conn->co_proc[i] == proc) {
-			conn->co_proc[i] = conn->co_proc[--conn->co_numchild];
-			break;
-		}
-	free_proc(proc);
-	free_conn(conn);
-}
-
-static void
 resize_conn(struct servtab *sep, int maxpip)
 {
 	struct conninfo *conn;
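
Note on the inetd.c hunks above: they replace the select(2) loop and the
signal-pipe trick with a single kqueue(2) descriptor. Listening sockets,
signals, and forked children are all registered as kevents, and the udata
field carries either a handler function (signals) or the owning struct
servtab (sockets and children), so the main loop dispatches straight from
the returned event instead of scanning tables. What follows is a minimal
standalone sketch of the signal half of that pattern, not code from the
patch; the names sketch.c and on_hup are illustrative only.

/*
 * sketch.c - minimal reproduction of the EVFILT_SIGNAL dispatch pattern
 * used by the patched inetd; illustrative only, no names from the patch.
 */
#include <sys/types.h>
#include <sys/event.h>

#include <err.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>

static void
on_hup(void)
{
	printf("SIGHUP: inetd would call config() here\n");
}

int
main(void)
{
	struct kevent kev;
	int kq, n;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	/*
	 * As in the patch, the disposition is SIG_IGN; EVFILT_SIGNAL
	 * still records delivery attempts, so no handler ever runs and
	 * the event loop is the only place signals are processed.
	 */
	signal(SIGHUP, SIG_IGN);
	/*
	 * Stash the handler in udata, mirroring WATCH_SIG().  Storing a
	 * function pointer in a void * is not strictly portable C, but
	 * it is exactly what the patch itself relies on.
	 */
	EV_SET(&kev, SIGHUP, EVFILT_SIGNAL, EV_ADD | EV_ENABLE, 0, 0,
	    (void *)on_hup);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent (register)");
	for (;;) {
		if ((n = kevent(kq, NULL, 0, &kev, 1, NULL)) == -1) {
			if (errno == EINTR)
				continue;
			err(1, "kevent (wait)");
		}
		if (n > 0 && kev.filter == EVFILT_SIGNAL)
			((void (*)(void))kev.udata)();
	}
}
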
Index: src/usr.sbin/inetd/inetd.h
diff -u src/usr.sbin/inetd/inetd.h:1.14 src/usr.sbin/inetd/inetd.h:1.13.1000.2
--- src/usr.sbin/inetd/inetd.h:1.14	Sat Aug  7 12:27:50 2004
+++ src/usr.sbin/inetd/inetd.h	Sat Aug  7 22:07:53 2004
@@ -74,7 +74,10 @@
 	int	se_maxchild;		/* max number of children */
 	int	se_maxcpm;		/* max connects per IP per minute */
 	int	se_numchild;		/* current number of children */
+	int	se_free;		/* free when numchild == 0 */
+#ifdef SANITY_CHECK
 	pid_t	*se_pids;		/* array of child pids */
+#endif
 	char	*se_user;		/* user name to run as */
 	char	*se_group;		/* group name to run as */
 #ifdef LOGIN_CAP
@@ -145,3 +148,14 @@
 	int	bi_maxchild;		/* max number of children, -1=default */
 	bi_fn_t	*bi_fn;			/* function which performs it */
 };
+
+void	watch __P((short, uintptr_t, void *, u_int, int));
+#define	WATCH_SOCK(fd, data)	watch(EVFILT_READ, fd, data, 0, 1)
+#define	UNWATCH_SOCK(fd, data)	watch(EVFILT_READ, fd, data, 0, 0)
+#define	WATCH_SIG(sig, data)	watch(EVFILT_SIGNAL, sig, data, 0, 1)
+#define	UNWATCH_SIG(sig, data)	watch(EVFILT_SIGNAL, sig, data, 0, 0)
+#define	WATCH_PROC(proc, data)	watch(EVFILT_PROC, proc, data, NOTE_EXIT, 1)
+#define	UNWATCH_PROC(proc, data)	watch(EVFILT_PROC, proc, data, NOTE_EXIT, 0)
+#define	WATCH_FD(fd, data)	watch(EVFILT_VNODE, fd, data, NOTE_DELETE|NOTE_WRITE|NOTE_EXTEND|NOTE_RENAME, 1)
+#define	UNWATCH_FD(fd, data)	watch(EVFILT_VNODE, fd, data, NOTE_DELETE|NOTE_WRITE|NOTE_EXTEND|NOTE_RENAME, 0)
+
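Note on WATCH_PROC: where the old inetd reaped children from a SIGCHLD
handler and then searched every service's se_pids array (the removed
reapchild() and reapchild_conn() above), the patched version registers each
forked pid with WATCH_PROC() and lets the NOTE_EXIT event name both the pid
(kev.ident) and its servtab (kev.udata). Below is a standalone sketch of
that idea, not code from the patch; the only identifiers taken from it are
the kevent(2) API itself.

/*
 * procwatch.c - minimal reproduction of the WATCH_PROC/NOTE_EXIT idea;
 * illustrative only.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/wait.h>

#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	pid_t pid;
	int kq, status;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	if ((pid = fork()) == -1)
		err(1, "fork");
	if (pid == 0) {
		sleep(1);	/* give the parent time to register */
		_exit(42);
	}
	/* Equivalent of WATCH_PROC(pid, sep); udata left NULL here. */
	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD | EV_ENABLE, NOTE_EXIT, 0,
	    NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent (register)");
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
		err(1, "kevent (wait)");
	/* kev.ident names the exact child; reap it without a table scan. */
	if (waitpid((pid_t)kev.ident, &status, WNOHANG) == pid &&
	    WIFEXITED(status))
		printf("pid %ld exited with status %d\n",
		    (long)kev.ident, WEXITSTATUS(status));
	return (0);
}

The sketch sidesteps the register/exit race by having the child sleep
briefly; the patch instead handles it in the main loop, where a wait4()
that returns 0 is logged ("can't reap pid") and flagged with an XXX comment
as potentially leaving a zombie.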